8 Commits

Author SHA1 Message Date
DeaTh-G
9ff80d8321 make store instructions check for mmio 2025-07-07 20:35:10 +02:00
DeaTh-G
830be1f69a fix vaddsws implementation 2025-07-07 20:35:10 +02:00
DeaTh-G
a5d6382975 add remaining altivec instructions 2025-07-07 20:35:08 +02:00
DeaTh-G
1d452c60a8 add vpkuhus implementation 2025-07-07 20:33:33 +02:00
DeaTh-G
cea0b2fc38 Fix instruction implementations based on unit tests 2025-07-07 20:33:30 +02:00
DeaTh-G
f6193ebe43 add more basic instructions 2025-07-07 20:31:58 +02:00
DeaTh-G
f23d22bc7f Fix indexing on certain instructions 2025-07-07 20:31:57 +02:00
DeaTh-G
847b750786 Add more instructions regarding Bakugan Battle Brawlers 2025-07-07 20:31:57 +02:00
8 changed files with 480 additions and 210 deletions

3
.gitmodules vendored
View File

@@ -13,6 +13,3 @@
[submodule "thirdparty/tiny-AES-c"]
path = thirdparty/tiny-AES-c
url = https://github.com/kokke/tiny-AES-c.git
[submodule "thirdparty/simde"]
path = thirdparty/simde
url = https://github.com/simd-everywhere/simde-no-tests.git

View File

@@ -1,6 +1,6 @@
# XenonRecomp
XenonRecomp is a tool that converts Xbox 360 executables into C++ code, which can then be recompiled for any platform.
XenonRecomp is a tool that converts Xbox 360 executables into C++ code, which can then be recompiled for any platform. Currently, it only supports x86 platforms due to the use of x86 intrinsics.
This project was heavily inspired by [N64: Recompiled](https://github.com/N64Recomp/N64Recomp), a similar tool for N64 executables.
@@ -20,7 +20,7 @@ Vector registers' endianness handling is more complicated. Instead of swapping i
The FPU expects denormalized numbers to remain unmodified, while VMX instructions always flush them. This is managed by storing the current floating-point state in the CPU state struct and enabling or disabling denormal flushing as necessary before executing each instruction.
Most VMX instructions are implemented using x86 intrinsics. Support for ARM64 is implemented using [SIMD Everywhere](https://github.com/simd-everywhere/simde).
Most VMX instructions are implemented using x86 intrinsics. Luckily, the number of AVX intrinsics used is relatively low, so adding support for other architectures using libraries like [SIMD Everywhere](https://github.com/simd-everywhere/simde) might be possible.
### MMIO

View File

@@ -16,4 +16,4 @@
#include <xbox.h>
#include <xxhash.h>
#include <fmt/core.h>
#include <x86/sse.h>
#include <xmmintrin.h>

File diff suppressed because it is too large Load Diff

View File

@@ -17,9 +17,8 @@ target_compile_definitions(XenonUtils
)
target_include_directories(XenonUtils
PUBLIC
PUBLIC
.
"${THIRDPARTY_ROOT}/simde"
PRIVATE
"${THIRDPARTY_ROOT}/libmspack/libmspack/mspack"
"${THIRDPARTY_ROOT}/tiny-AES-c"

View File

@@ -12,13 +12,13 @@
#include <cstdlib>
#include <cstring>
#include <x86/avx.h>
#include <x86/sse.h>
#include <x86/sse4.1.h>
#include <x86intrin.h>
// SSE3 constants are missing from simde
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#ifdef _WIN32
#include <intrin.h>
#else
#include <xmmintrin.h>
#include <smmintrin.h>
#endif
#define PPC_JOIN(x, y) x##y
@@ -172,18 +172,18 @@ struct PPCCRRegister
eq = !un && (left == right);
}
inline void setFromMask(simde__m128 mask, int imm) noexcept
inline void setFromMask(__m128 mask, int imm) noexcept
{
int m = simde_mm_movemask_ps(mask);
int m = _mm_movemask_ps(mask);
lt = m == imm; // all equal
gt = 0;
eq = m == 0; // none equal
so = 0;
}
inline void setFromMask(simde__m128i mask, int imm) noexcept
inline void setFromMask(__m128i mask, int imm) noexcept
{
int m = simde_mm_movemask_epi8(mask);
int m = _mm_movemask_epi8(mask);
lt = m == imm; // all equal
gt = 0;
eq = m == 0; // none equal
@@ -215,71 +215,34 @@ struct PPCFPSCRRegister
{
uint32_t csr;
static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN };
static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO };
// simde does not handle denormal flags, so we need to implement per-arch.
#if defined(__x86_64__) || defined(_M_X64)
static constexpr size_t RoundShift = 13;
static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK;
static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN };
inline uint32_t getcsr() noexcept
{
return simde_mm_getcsr();
}
inline void setcsr(uint32_t csr) noexcept
{
simde_mm_setcsr(csr);
}
#elif defined(__aarch64__) || defined(_M_ARM64)
// RMode
static constexpr size_t RoundShift = 22;
static constexpr size_t RoundMask = 3 << RoundShift;
// FZ and FZ16
static constexpr size_t FlushMask = (1 << 19) | (1 << 24);
// Nearest, Zero, -Infinity, -Infinity
static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift };
inline uint32_t getcsr() noexcept
{
uint64_t csr;
__asm__ __volatile__("mrs %0, fpcr" : "=r"(csr));
return csr;
}
inline void setcsr(uint32_t csr) noexcept
{
__asm__ __volatile__("msr fpcr, %0" : : "r"(csr));
}
#else
# error "Missing implementation for FPSCR."
#endif
inline uint32_t loadFromHost() noexcept
{
csr = getcsr();
return HostToGuest[(csr & RoundMask) >> RoundShift];
csr = _mm_getcsr();
return HostToGuest[(csr & _MM_ROUND_MASK) >> 13];
}
inline void storeFromGuest(uint32_t value) noexcept
{
csr &= ~RoundMask;
csr &= ~_MM_ROUND_MASK;
csr |= GuestToHost[value & PPC_ROUND_MASK];
setcsr(csr);
_mm_setcsr(csr);
}
static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
inline void enableFlushModeUnconditional() noexcept
{
csr |= FlushMask;
setcsr(csr);
_mm_setcsr(csr);
}
inline void disableFlushModeUnconditional() noexcept
{
csr &= ~FlushMask;
setcsr(csr);
_mm_setcsr(csr);
}
inline void enableFlushMode() noexcept
@@ -287,7 +250,7 @@ struct PPCFPSCRRegister
if ((csr & FlushMask) != FlushMask) [[unlikely]]
{
csr |= FlushMask;
setcsr(csr);
_mm_setcsr(csr);
}
}
@@ -296,7 +259,7 @@ struct PPCFPSCRRegister
if ((csr & FlushMask) != 0) [[unlikely]]
{
csr &= ~FlushMask;
setcsr(csr);
_mm_setcsr(csr);
}
}
};
@@ -624,80 +587,81 @@ inline uint8_t VectorShiftTableR[] =
0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
};
inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b)
inline __m128i _mm_adds_epu32(__m128i a, __m128i b)
{
return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b));
return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b));
}
inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b)
inline __m128i _mm_avg_epi8(__m128i a, __m128i b)
{
simde__m128i c = simde_mm_set1_epi8(char(128));
return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
__m128i c = _mm_set1_epi8(char(128));
return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
}
inline simde__m128i simde_mm_avg_epi16(simde__m128i a, simde__m128i b)
inline __m128i _mm_avg_epi16(__m128i a, __m128i b)
{
simde__m128i c = simde_mm_set1_epi16(short(32768));
return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
__m128i c = _mm_set1_epi16(short(32768));
return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
}
inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1)
inline __m128 _mm_cvtepu32_ps_(__m128i src1)
{
simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127));
simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8);
xmm0 = simde_mm_srli_epi32(xmm0, 31);
xmm0 = simde_mm_add_epi32(xmm0, xmm1);
xmm0 = simde_mm_srai_epi32(xmm0, 8);
xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000));
simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1);
return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1));
__m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127));
__m128i xmm0 = _mm_slli_epi32(src1, 31 - 8);
xmm0 = _mm_srli_epi32(xmm0, 31);
xmm0 = _mm_add_epi32(xmm0, xmm1);
xmm0 = _mm_srai_epi32(xmm0, 8);
xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000));
__m128 xmm2 = _mm_cvtepi32_ps(src1);
return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1));
}
inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c)
inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c)
{
simde__m128i d = simde_mm_set1_epi8(0xF);
simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d));
return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3));
__m128i d = _mm_set1_epi8(0xF);
__m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d));
return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3));
}
inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b)
inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b)
{
simde__m128i c = simde_mm_set1_epi8(char(128));
return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
__m128i c = _mm_set1_epi8(char(128));
return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
}
inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b)
inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b)
{
simde__m128i c = simde_mm_set1_epi16(short(32768));
return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
__m128i c = _mm_set1_epi16(short(32768));
return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
}
inline simde__m128i simde_mm_vctsxs(simde__m128 src1)
inline __m128i _mm_vctsxs(__m128 src1)
{
simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1);
simde__m128i xmm0 = simde_mm_cvttps_epi32(src1);
simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN));
xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1);
simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1));
return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest));
__m128 xmm2 = _mm_cmpunord_ps(src1, src1);
__m128i xmm0 = _mm_cvttps_epi32(src1);
__m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN));
xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1);
__m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1));
return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest));
}
inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b)
inline __m128i _mm_vctuxs(__m128 src1)
{
b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61);
return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10));
__m128 xmm0 = _mm_max_ps(src1, _mm_set1_epi32(0));
__m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000));
__m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000));
xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1);
__m128i dest = _mm_cvttps_epi32(xmm0);
xmm0 = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN));
xmm1 = _mm_and_si128(xmm1, _mm_set1_epi32(INT_MIN));
dest = _mm_add_epi32(dest, xmm1);
return _mm_or_si128(dest, xmm0);
}
#if defined(__aarch64__) || defined(_M_ARM64)
inline uint64_t __rdtsc()
inline __m128i _mm_vsr(__m128i a, __m128i b)
{
uint64_t ret;
asm volatile("mrs %0, cntvct_el0\n\t"
: "=r"(ret)::"memory");
return ret;
b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61);
return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10));
}
#elif !defined(__x86_64__) && !defined(_M_X64)
# error "Missing implementation for __rdtsc()"
#endif
#endif

View File

@@ -141,7 +141,7 @@ inline bool bitScanForward(uint64_t v, uint32_t *outFirstSetIndex)
}
#endif
int lzxDecompress(const void *lzxData, size_t lzxLength, void *dst, size_t dstLength, uint32_t windowSize, void *windowData, size_t windowDataLength)
static int lzxDecompress(const void *lzxData, size_t lzxLength, void *dst, size_t dstLength, uint32_t windowSize, void *windowData, size_t windowDataLength)
{
int resultCode = 1;
uint32_t windowBits;

1
thirdparty/simde vendored

Submodule thirdparty/simde deleted from a532a12ca9