make store instructions check for mmio

fix vaddsws implementation
add remaining altivec instructions
2025-11-04 06:47:09 +00:00 · 2025-07-07 20:35:10 +02:00 · 2025-07-07 20:35:10 +02:00 · 2025-07-07 20:35:08 +02:00 · 2025-07-07 20:33:33 +02:00 · 2025-07-07 20:33:30 +02:00
8 changed files with 480 additions and 210 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,3 @@
 [submodule "thirdparty/tiny-AES-c"]
 	path = thirdparty/tiny-AES-c
 	url = https://github.com/kokke/tiny-AES-c.git
-[submodule "thirdparty/simde"]
-	path = thirdparty/simde
-	url = https://github.com/simd-everywhere/simde-no-tests.git
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # XenonRecomp

-XenonRecomp is a tool that converts Xbox 360 executables into C++ code, which can then be recompiled for any platform.
+XenonRecomp is a tool that converts Xbox 360 executables into C++ code, which can then be recompiled for any platform. Currently, it only supports x86 platforms due to the use of x86 intrinsics.

 This project was heavily inspired by [N64: Recompiled](https://github.com/N64Recomp/N64Recomp), a similar tool for N64 executables.

@@ -20,7 +20,7 @@ Vector registers' endianness handling is more complicated. Instead of swapping i

 The FPU expects denormalized numbers to remain unmodified, while VMX instructions always flush them. This is managed by storing the current floating-point state in the CPU state struct and enabling or disabling denormal flushing as necessary before executing each instruction.

-Most VMX instructions are implemented using x86 intrinsics. Support for ARM64 is implemented using [SIMD Everywhere](https://github.com/simd-everywhere/simde).
+Most VMX instructions are implemented using x86 intrinsics. Luckily, the number of AVX intrinsics used is relatively low, so adding support for other architectures using libraries like [SIMD Everywhere](https://github.com/simd-everywhere/simde) might be possible.

 ### MMIO

--- a/XenonRecomp/pch.h
+++ b/XenonRecomp/pch.h
@@ -16,4 +16,4 @@
 #include <xbox.h>
 #include <xxhash.h>
 #include <fmt/core.h>
-#include <x86/sse.h>
+#include <xmmintrin.h>
--- a/XenonRecomp/recompiler.cpp
+++ b/XenonRecomp/recompiler.cpp
--- a/XenonUtils/CMakeLists.txt
+++ b/XenonUtils/CMakeLists.txt
@@ -17,9 +17,8 @@ target_compile_definitions(XenonUtils
 )

 target_include_directories(XenonUtils 
-    PUBLIC
+    PUBLIC 
        .
-        "${THIRDPARTY_ROOT}/simde"
    PRIVATE
        "${THIRDPARTY_ROOT}/libmspack/libmspack/mspack"
        "${THIRDPARTY_ROOT}/tiny-AES-c"
--- a/XenonUtils/ppc_context.h
+++ b/XenonUtils/ppc_context.h
@@ -12,13 +12,13 @@
 #include <cstdlib>
 #include <cstring>

-#include <x86/avx.h>
-#include <x86/sse.h>
-#include <x86/sse4.1.h>
+#include <x86intrin.h>

-// SSE3 constants are missing from simde
-#ifndef _MM_DENORMALS_ZERO_MASK
-#define _MM_DENORMALS_ZERO_MASK 0x0040
+#ifdef _WIN32
+#include <intrin.h>
+#else
+#include <xmmintrin.h>
+#include <smmintrin.h>
 #endif

 #define PPC_JOIN(x, y) x##y
@@ -172,18 +172,18 @@ struct PPCCRRegister
        eq = !un && (left == right);
    }

-    inline void setFromMask(simde__m128 mask, int imm) noexcept
+    inline void setFromMask(__m128 mask, int imm) noexcept
    {
-        int m = simde_mm_movemask_ps(mask);
+        int m = _mm_movemask_ps(mask);
        lt = m == imm; // all equal
        gt = 0;
        eq = m == 0; // none equal
        so = 0;
    }

-    inline void setFromMask(simde__m128i mask, int imm) noexcept
+    inline void setFromMask(__m128i mask, int imm) noexcept
    {
-        int m = simde_mm_movemask_epi8(mask);
+        int m = _mm_movemask_epi8(mask);
        lt = m == imm; // all equal
        gt = 0;
        eq = m == 0; // none equal
@@ -215,71 +215,34 @@ struct PPCFPSCRRegister
 {
    uint32_t csr;

+    static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN };
    static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO };

-    // simde does not handle denormal flags, so we need to implement per-arch.
-#if defined(__x86_64__) || defined(_M_X64)
-    static constexpr size_t RoundShift = 13;
-    static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK;
-    static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
-    static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN };
-
-    inline uint32_t getcsr() noexcept
-    {
-        return simde_mm_getcsr();
-    }
-
-    inline void setcsr(uint32_t csr) noexcept
-    {
-        simde_mm_setcsr(csr);
-    }
-#elif defined(__aarch64__) || defined(_M_ARM64)
-    // RMode
-    static constexpr size_t RoundShift = 22;
-    static constexpr size_t RoundMask = 3 << RoundShift;
-    // FZ and FZ16
-    static constexpr size_t FlushMask = (1 << 19) | (1 << 24);
-    // Nearest, Zero, -Infinity, -Infinity
-    static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift };
-
-    inline uint32_t getcsr() noexcept
-    {
-        uint64_t csr;
-        __asm__ __volatile__("mrs %0, fpcr" : "=r"(csr));
-        return csr;
-    }
-
-    inline void setcsr(uint32_t csr) noexcept
-    {
-        __asm__ __volatile__("msr fpcr, %0" : : "r"(csr));
-    }
-#else
-#   error "Missing implementation for FPSCR."
-#endif
-
    inline uint32_t loadFromHost() noexcept
    {
-        csr = getcsr();
-        return HostToGuest[(csr & RoundMask) >> RoundShift];
+        csr = _mm_getcsr();
+        return HostToGuest[(csr & _MM_ROUND_MASK) >> 13];
    }
        
    inline void storeFromGuest(uint32_t value) noexcept
    {
-        csr &= ~RoundMask;
+        csr &= ~_MM_ROUND_MASK;
        csr |= GuestToHost[value & PPC_ROUND_MASK];
-        setcsr(csr);
+        _mm_setcsr(csr);
    }

+    static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
+
    inline void enableFlushModeUnconditional() noexcept
    {
        csr |= FlushMask;
-        setcsr(csr);
+        _mm_setcsr(csr);
    }

    inline void disableFlushModeUnconditional() noexcept
    {
        csr &= ~FlushMask;
-        setcsr(csr);
+        _mm_setcsr(csr);
    }

    inline void enableFlushMode() noexcept
@@ -287,7 +250,7 @@ struct PPCFPSCRRegister
        if ((csr & FlushMask) != FlushMask) [[unlikely]]
        {
            csr |= FlushMask;
-            setcsr(csr);
+            _mm_setcsr(csr);
        }
    }

@@ -296,7 +259,7 @@ struct PPCFPSCRRegister
        if ((csr & FlushMask) != 0) [[unlikely]]
        {
            csr &= ~FlushMask;
-            setcsr(csr);
+            _mm_setcsr(csr);
        }
    }
 };
@@ -624,80 +587,81 @@ inline uint8_t VectorShiftTableR[] =
    0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
 };

-inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b)
+inline __m128i _mm_adds_epu32(__m128i a, __m128i b) 
 {
-    return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b));
+    return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b));
 }

-inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b)
+inline __m128i _mm_avg_epi8(__m128i a, __m128i b)
 {
-    simde__m128i c = simde_mm_set1_epi8(char(128));
-    return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
+    __m128i c = _mm_set1_epi8(char(128));
+    return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
 }

-inline simde__m128i simde_mm_avg_epi16(simde__m128i a, simde__m128i b)
+inline __m128i _mm_avg_epi16(__m128i a, __m128i b)
 {
-    simde__m128i c = simde_mm_set1_epi16(short(32768));
-    return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
+    __m128i c = _mm_set1_epi16(short(32768));
+    return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
 }

-inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1)
+inline __m128 _mm_cvtepu32_ps_(__m128i src1)
 {
-    simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127));
-    simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8);
-    xmm0 = simde_mm_srli_epi32(xmm0, 31);
-    xmm0 = simde_mm_add_epi32(xmm0, xmm1);
-    xmm0 = simde_mm_srai_epi32(xmm0, 8);
-    xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000));
-    simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1);
-    return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1));
+    __m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127));
+    __m128i xmm0 = _mm_slli_epi32(src1, 31 - 8);
+    xmm0 = _mm_srli_epi32(xmm0, 31);
+    xmm0 = _mm_add_epi32(xmm0, xmm1);
+    xmm0 = _mm_srai_epi32(xmm0, 8);
+    xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000));
+    __m128 xmm2 = _mm_cvtepi32_ps(src1);
+    return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1));
 }

-inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c)
+inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c)
 {
-    simde__m128i d = simde_mm_set1_epi8(0xF);
-    simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d));
-    return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3));
+    __m128i d = _mm_set1_epi8(0xF);
+    __m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d));
+    return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3));
 }

-inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b)
+inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b)
 {
-    simde__m128i c = simde_mm_set1_epi8(char(128));
-    return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
+    __m128i c = _mm_set1_epi8(char(128));
+    return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
 }

-inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b)
+inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b)
 {
-    simde__m128i c = simde_mm_set1_epi16(short(32768));
-    return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
+    __m128i c = _mm_set1_epi16(short(32768));
+    return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
 }

-inline simde__m128i simde_mm_vctsxs(simde__m128 src1)
+inline __m128i _mm_vctsxs(__m128 src1)
 {
-    simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1);
-    simde__m128i xmm0 = simde_mm_cvttps_epi32(src1);
-    simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN));
-    xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1);
-    simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1));
-    return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest));
+    __m128 xmm2 = _mm_cmpunord_ps(src1, src1);
+    __m128i xmm0 = _mm_cvttps_epi32(src1);
+    __m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN));
+    xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1);
+    __m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1));
+    return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest));
 }

-inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b)
+inline __m128i _mm_vctuxs(__m128 src1)
 {
-    b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61);
-    return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10));
+    __m128 xmm0 = _mm_max_ps(src1, _mm_set1_epi32(0));
+    __m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000));
+    __m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000));
+    xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1);
+    __m128i dest = _mm_cvttps_epi32(xmm0);
+    xmm0 = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN));
+    xmm1 = _mm_and_si128(xmm1, _mm_set1_epi32(INT_MIN));
+    dest = _mm_add_epi32(dest, xmm1);
+    return _mm_or_si128(dest, xmm0);
 }

-#if defined(__aarch64__) || defined(_M_ARM64)
-inline uint64_t __rdtsc()
+inline __m128i _mm_vsr(__m128i a, __m128i b)
 {
-    uint64_t ret;
-    asm volatile("mrs %0, cntvct_el0\n\t"
-                 : "=r"(ret)::"memory");
-    return ret;
+    b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61);
+    return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10));
 }
-#elif !defined(__x86_64__) && !defined(_M_X64)
-#   error "Missing implementation for __rdtsc()"
-#endif

 #endif
--- a/XenonUtils/xex_patcher.cpp
+++ b/XenonUtils/xex_patcher.cpp
@@ -141,7 +141,7 @@ inline bool bitScanForward(uint64_t v, uint32_t *outFirstSetIndex)
 }
 #endif

-int lzxDecompress(const void *lzxData, size_t lzxLength, void *dst, size_t dstLength, uint32_t windowSize, void *windowData, size_t windowDataLength)
+static int lzxDecompress(const void *lzxData, size_t lzxLength, void *dst, size_t dstLength, uint32_t windowSize, void *windowData, size_t windowDataLength)
 {
    int resultCode = 1;
    uint32_t windowBits;
--- a/thirdparty/simde
+++ b/thirdparty/simde
Author	SHA1	Message	Date
DeaTh-G	9ff80d8321	make store instructions check for mmio	2025-07-07 20:35:10 +02:00
DeaTh-G	830be1f69a	fix vaddsws implementation	2025-07-07 20:35:10 +02:00
DeaTh-G	a5d6382975	add remaining altivec instructions	2025-07-07 20:35:08 +02:00
DeaTh-G	1d452c60a8	add vpkuhus implementation	2025-07-07 20:33:33 +02:00
DeaTh-G	cea0b2fc38	Fix instruction implementations based on unit tests	2025-07-07 20:33:30 +02:00
DeaTh-G	f6193ebe43	add more basic instructions	2025-07-07 20:31:58 +02:00
DeaTh-G	f23d22bc7f	Fix indexing on certain instructions	2025-07-07 20:31:57 +02:00
DeaTh-G	847b750786	Add more instructions regarding Bakugan Battle Brawlers	2025-07-07 20:31:57 +02:00