Zelda64Recomp/include/rsp_vu.h

205 lines
10 KiB
C++

// This file is modified from the Ares N64 emulator core. Ares can
// be found at https://github.com/ares-emulator/ares. The original license
// for this portion of Ares is as follows:
// ----------------------------------------------------------------------
// ares
//
// Copyright(c) 2004 - 2021 ares team, Near et al
//
// Permission to use, copy, modify, and /or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright noticeand this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS.IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
// ----------------------------------------------------------------------
#include <cstdint>
#define ARCHITECTURE_AMD64
#define ARCHITECTURE_SUPPORTS_SSE4_1 1
#if defined(ARCHITECTURE_AMD64)
#include <nmmintrin.h>
using v128 = __m128i;
#elif defined(ARCHITECTURE_ARM64)
#include <sse2neon.h>
using v128 = __m128i;
#endif
namespace Accuracy {
namespace RSP {
#if ARCHITECTURE_SUPPORTS_SSE4_1
constexpr bool SISD = false;
constexpr bool SIMD = true;
#else
constexpr bool SISD = true;
constexpr bool SIMD = false;
#endif
}
}
using u8 = uint8_t;
using s8 = int8_t;
using u16 = uint16_t;
using s16 = int16_t;
using u32 = uint32_t;
using s32 = int32_t;
using u64 = uint64_t;
using s64 = int64_t;
using uint128_t = uint64_t[2];
template<u32 bits> inline auto sclamp(s64 x) -> s64 {
enum : s64 { b = 1ull << (bits - 1), m = b - 1 };
return (x > m) ? m : (x < -b) ? -b : x;
}
template<u32 bits> inline auto sclip(s64 x) -> s64 {
enum : u64 { b = 1ull << (bits - 1), m = b * 2 - 1 };
return ((x & m) ^ b) - b;
}
struct RSP {
using r32 = uint32_t;
using cr32 = const r32;
union r128 {
struct { uint64_t u128[2]; };
#if ARCHITECTURE_SUPPORTS_SSE4_1
struct { __m128i v128; };
operator __m128i() const { return v128; }
auto operator=(__m128i value) { v128 = value; }
#endif
auto byte(u32 index) -> uint8_t& { return ((uint8_t*)&u128)[15 - index]; }
auto byte(u32 index) const -> uint8_t { return ((uint8_t*)&u128)[15 - index]; }
auto element(u32 index) -> uint16_t& { return ((uint16_t*)&u128)[7 - index]; }
auto element(u32 index) const -> uint16_t { return ((uint16_t*)&u128)[7 - index]; }
auto u8(u32 index) -> uint8_t& { return ((uint8_t*)&u128)[15 - index]; }
auto u8(u32 index) const -> uint8_t { return ((uint8_t*)&u128)[15 - index]; }
auto s16(u32 index) -> int16_t& { return ((int16_t*)&u128)[7 - index]; }
auto s16(u32 index) const -> int16_t { return ((int16_t*)&u128)[7 - index]; }
auto u16(u32 index) -> uint16_t& { return ((uint16_t*)&u128)[7 - index]; }
auto u16(u32 index) const -> uint16_t { return ((uint16_t*)&u128)[7 - index]; }
//VCx registers
auto get(u32 index) const -> bool { return u16(index) != 0; }
auto set(u32 index, bool value) -> bool { return u16(index) = 0 - value, value; }
//vu-registers.cpp
inline auto operator()(u32 index) const -> r128;
};
using cr128 = const r128;
struct VU {
r128 r[32];
r128 acch, accm, accl;
r128 vcoh, vcol; //16-bit little endian
r128 vcch, vccl; //16-bit little endian
r128 vce; // 8-bit little endian
s16 divin;
s16 divout;
bool divdp;
} vpu;
static constexpr r128 zero{0};
static constexpr r128 invert{(uint64_t)-1, (uint64_t)-1};
inline auto accumulatorGet(u32 index) const -> u64;
inline auto accumulatorSet(u32 index, u64 value) -> void;
inline auto accumulatorSaturate(u32 index, bool slice, u16 negative, u16 positive) const -> u16;
inline auto CFC2(r32& rt, u8 rd) -> void;
inline auto CTC2(cr32& rt, u8 rd) -> void;
template<u8 e> inline auto LBV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LDV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LFV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LHV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LLV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LPV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LQV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LRV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LSV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LTV(u8 vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LUV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto LWV(r128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto MFC2(r32& rt, cr128& vs) -> void;
template<u8 e> inline auto MTC2(cr32& rt, r128& vs) -> void;
template<u8 e> inline auto SBV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SDV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SFV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SHV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SLV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SPV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SQV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SRV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SSV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto STV(u8 vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SUV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto SWV(cr128& vt, cr32& rs, s8 imm) -> void;
template<u8 e> inline auto VABS(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VADD(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VADDC(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VAND(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VCH(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VCL(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VCR(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VEQ(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VGE(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VLT(r128& vd, cr128& vs, cr128& vt) -> void;
template<bool U, u8 e>
inline auto VMACF(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMACF(r128& vd, cr128& vs, cr128& vt) -> void { VMACF<0, e>(vd, vs, vt); }
template<u8 e> inline auto VMACU(r128& vd, cr128& vs, cr128& vt) -> void { VMACF<1, e>(vd, vs, vt); }
inline auto VMACQ(r128& vd) -> void;
template<u8 e> inline auto VMADH(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMADL(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMADM(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMADN(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMOV(r128& vd, u8 de, cr128& vt) -> void;
template<u8 e> inline auto VMRG(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMUDH(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMUDL(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMUDM(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMUDN(r128& vd, cr128& vs, cr128& vt) -> void;
template<bool U, u8 e>
inline auto VMULF(r128& rd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VMULF(r128& rd, cr128& vs, cr128& vt) -> void { VMULF<0, e>(rd, vs, vt); }
template<u8 e> inline auto VMULU(r128& rd, cr128& vs, cr128& vt) -> void { VMULF<1, e>(rd, vs, vt); }
template<u8 e> inline auto VMULQ(r128& rd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VNAND(r128& rd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VNE(r128& vd, cr128& vs, cr128& vt) -> void;
inline auto VNOP() -> void;
template<u8 e> inline auto VNOR(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VNXOR(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VOR(r128& vd, cr128& vs, cr128& vt) -> void;
template<bool L, u8 e>
inline auto VRCP(r128& vd, u8 de, cr128& vt) -> void;
template<u8 e> inline auto VRCP(r128& vd, u8 de, cr128& vt) -> void { VRCP<0, e>(vd, de, vt); }
template<u8 e> inline auto VRCPL(r128& vd, u8 de, cr128& vt) -> void { VRCP<1, e>(vd, de, vt); }
template<u8 e> inline auto VRCPH(r128& vd, u8 de, cr128& vt) -> void;
template<bool D, u8 e>
inline auto VRND(r128& vd, u8 vs, cr128& vt) -> void;
template<u8 e> inline auto VRNDN(r128& vd, u8 vs, cr128& vt) -> void { VRND<0, e>(vd, vs, vt); }
template<u8 e> inline auto VRNDP(r128& vd, u8 vs, cr128& vt) -> void { VRND<1, e>(vd, vs, vt); }
template<bool L, u8 e>
inline auto VRSQ(r128& vd, u8 de, cr128& vt) -> void;
template<u8 e> inline auto VRSQ(r128& vd, u8 de, cr128& vt) -> void { VRSQ<0, e>(vd, de, vt); }
template<u8 e> inline auto VRSQL(r128& vd, u8 de, cr128& vt) -> void { VRSQ<1, e>(vd, de, vt); }
template<u8 e> inline auto VRSQH(r128& vd, u8 de, cr128& vt) -> void;
template<u8 e> inline auto VSAR(r128& vd, cr128& vs) -> void;
template<u8 e> inline auto VSUB(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VSUBC(r128& vd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VXOR(r128& rd, cr128& vs, cr128& vt) -> void;
template<u8 e> inline auto VZERO(r128& rd, cr128& vs, cr128& vt) -> void;
};