mirror of
				https://github.com/hedge-dev/XenonRecomp.git
				synced 2025-10-30 07:11:38 +00:00 
			
		
		
		
	 c5bfd90d87
			
		
	
	
		c5bfd90d87
		
			
		
	
	
	
	
		
			
			* Inital add SIMD-E Signed-off-by: Isaac Marovitz <isaacryu@icloud.com> * Include simde in ppc_context.h Signed-off-by: Isaac Marovitz <isaacryu@icloud.com> * Fix missing defines, don’t use native aliases Signed-off-by: Isaac Marovitz <isaacryu@icloud.com> * Update recompiler Signed-off-by: Isaac Marovitz <isaacryu@icloud.com> * Correct fround constants Signed-off-by: Isaac Marovitz <isaacryu@icloud.com> * Implement __rdtsc and FPCSR for ARM64. * Remove static Signed-off-by: Isaac Marovitz <isaacryu@icloud.com> --------- Signed-off-by: Isaac Marovitz <isaacryu@icloud.com> Co-authored-by: squidbus <175574877+squidbus@users.noreply.github.com>
		
			
				
	
	
		
			703 lines
		
	
	
	
		
			22 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			703 lines
		
	
	
	
		
			22 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #ifndef PPC_CONTEXT_H_INCLUDED
 | |
| #define PPC_CONTEXT_H_INCLUDED
 | |
| 
 | |
| #ifndef PPC_CONFIG_H_INCLUDED
 | |
| #error "ppc_config.h must be included before ppc_context.h"
 | |
| #endif
 | |
| 
 | |
| #include <climits>
 | |
| #include <cmath>
 | |
| #include <csetjmp>
 | |
| #include <cstdint>
 | |
| #include <cstdlib>
 | |
| #include <cstring>
 | |
| 
 | |
| #include <x86/avx.h>
 | |
| #include <x86/sse.h>
 | |
| #include <x86/sse4.1.h>
 | |
| 
 | |
| // SSE3 constants are missing from simde
 | |
| #ifndef _MM_DENORMALS_ZERO_MASK
 | |
| #define _MM_DENORMALS_ZERO_MASK 0x0040
 | |
| #endif
 | |
| 
 | |
| #define PPC_JOIN(x, y) x##y
 | |
| #define PPC_XSTRINGIFY(x) #x
 | |
| #define PPC_STRINGIFY(x) PPC_XSTRINGIFY(x)
 | |
| #define PPC_FUNC(x) void x(PPCContext& __restrict ctx, uint8_t* base)
 | |
| #define PPC_FUNC_IMPL(x) extern "C" PPC_FUNC(x)
 | |
| #define PPC_EXTERN_FUNC(x) extern PPC_FUNC(x)
 | |
| #define PPC_WEAK_FUNC(x) __attribute__((weak,noinline)) PPC_FUNC(x)
 | |
| 
 | |
| #define PPC_FUNC_PROLOGUE() __builtin_assume(((size_t)base & 0x1F) == 0)
 | |
| 
 | |
| #ifndef PPC_LOAD_U8
 | |
| #define PPC_LOAD_U8(x) *(volatile uint8_t*)(base + (x))
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_LOAD_U16
 | |
| #define PPC_LOAD_U16(x) __builtin_bswap16(*(volatile uint16_t*)(base + (x)))
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_LOAD_U32
 | |
| #define PPC_LOAD_U32(x) __builtin_bswap32(*(volatile uint32_t*)(base + (x)))
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_LOAD_U64
 | |
| #define PPC_LOAD_U64(x) __builtin_bswap64(*(volatile uint64_t*)(base + (x)))
 | |
| #endif
 | |
| 
 | |
| // TODO: Implement.
 | |
| // These are currently unused. However, MMIO loads could possibly be handled statically with some profiling and a fallback.
 | |
| // The fallback would be a runtime exception handler which will intercept reads from MMIO regions 
 | |
| // and log the PC for compiling to static code later.
 | |
| #ifndef PPC_MM_LOAD_U8
 | |
| #define PPC_MM_LOAD_U8(x)  PPC_LOAD_U8 (x)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_MM_LOAD_U16
 | |
| #define PPC_MM_LOAD_U16(x) PPC_LOAD_U16(x)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_MM_LOAD_U32
 | |
| #define PPC_MM_LOAD_U32(x) PPC_LOAD_U32(x)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_MM_LOAD_U64
 | |
| #define PPC_MM_LOAD_U64(x) PPC_LOAD_U64(x)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_STORE_U8
 | |
| #define PPC_STORE_U8(x, y) *(volatile uint8_t*)(base + (x)) = (y)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_STORE_U16
 | |
| #define PPC_STORE_U16(x, y) *(volatile uint16_t*)(base + (x)) = __builtin_bswap16(y)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_STORE_U32
 | |
| #define PPC_STORE_U32(x, y) *(volatile uint32_t*)(base + (x)) = __builtin_bswap32(y)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_STORE_U64
 | |
| #define PPC_STORE_U64(x, y) *(volatile uint64_t*)(base + (x)) = __builtin_bswap64(y)
 | |
| #endif
 | |
| 
 | |
| // MMIO Store handling is completely reliant on being preeceded by eieio.
 | |
| // TODO: Verify if that's always the case.
 | |
| #ifndef PPC_MM_STORE_U8
 | |
| #define PPC_MM_STORE_U8(x, y)   PPC_STORE_U8 (x, y)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_MM_STORE_U16
 | |
| #define PPC_MM_STORE_U16(x, y)  PPC_STORE_U16(x, y)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_MM_STORE_U32
 | |
| #define PPC_MM_STORE_U32(x, y)  PPC_STORE_U32(x, y)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_MM_STORE_U64
 | |
| #define PPC_MM_STORE_U64(x, y)  PPC_STORE_U64(x, y)
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_CALL_FUNC
 | |
| #define PPC_CALL_FUNC(x) x(ctx, base)
 | |
| #endif
 | |
| 
 | |
| #define PPC_MEMORY_SIZE 0x100000000ull
 | |
| 
 | |
| #define PPC_LOOKUP_FUNC(x, y) *(PPCFunc**)(x + PPC_IMAGE_BASE + PPC_IMAGE_SIZE + (uint64_t(uint32_t(y) - PPC_CODE_BASE) * 2))
 | |
| 
 | |
| #ifndef PPC_CALL_INDIRECT_FUNC
 | |
| #define PPC_CALL_INDIRECT_FUNC(x) (PPC_LOOKUP_FUNC(base, x))(ctx, base)
 | |
| #endif
 | |
| 
 | |
| typedef void PPCFunc(struct PPCContext& __restrict__ ctx, uint8_t* base);
 | |
| 
 | |
| struct PPCFuncMapping
 | |
| {
 | |
|     size_t guest;
 | |
|     PPCFunc* host;
 | |
| };
 | |
| 
 | |
| extern PPCFuncMapping PPCFuncMappings[];
 | |
| 
 | |
| union PPCRegister
 | |
| {
 | |
|     int8_t s8;
 | |
|     uint8_t u8;
 | |
|     int16_t s16;
 | |
|     uint16_t u16;
 | |
|     int32_t s32;
 | |
|     uint32_t u32;
 | |
|     int64_t s64;
 | |
|     uint64_t u64;
 | |
|     float f32;
 | |
|     double f64;
 | |
| };
 | |
| 
 | |
| struct PPCXERRegister
 | |
| {
 | |
|     uint8_t so;
 | |
|     uint8_t ov;
 | |
|     uint8_t ca;
 | |
| };
 | |
| 
 | |
| struct PPCCRRegister
 | |
| {
 | |
|     uint8_t lt;
 | |
|     uint8_t gt;
 | |
|     uint8_t eq;
 | |
|     union
 | |
|     {
 | |
|         uint8_t so;
 | |
|         uint8_t un;
 | |
|     };
 | |
| 
 | |
|     template<typename T>
 | |
|     inline void compare(T left, T right, const PPCXERRegister& xer) noexcept
 | |
|     {
 | |
|         lt = left < right;
 | |
|         gt = left > right;
 | |
|         eq = left == right;
 | |
|         so = xer.so;
 | |
|     }
 | |
| 
 | |
|     inline void compare(double left, double right) noexcept
 | |
|     {
 | |
|         un = __builtin_isnan(left) || __builtin_isnan(right);
 | |
|         lt = !un && (left < right);
 | |
|         gt = !un && (left > right);
 | |
|         eq = !un && (left == right);
 | |
|     }
 | |
| 
 | |
|     inline void setFromMask(simde__m128 mask, int imm) noexcept
 | |
|     {
 | |
|         int m = simde_mm_movemask_ps(mask);
 | |
|         lt = m == imm; // all equal
 | |
|         gt = 0;
 | |
|         eq = m == 0; // none equal
 | |
|         so = 0;
 | |
|     }
 | |
| 
 | |
|     inline void setFromMask(simde__m128i mask, int imm) noexcept
 | |
|     {
 | |
|         int m = simde_mm_movemask_epi8(mask);
 | |
|         lt = m == imm; // all equal
 | |
|         gt = 0;
 | |
|         eq = m == 0; // none equal
 | |
|         so = 0;
 | |
|     }
 | |
| };
 | |
| 
 | |
| union alignas(0x10) PPCVRegister
 | |
| {
 | |
|     int8_t s8[16];
 | |
|     uint8_t u8[16];
 | |
|     int16_t s16[8];
 | |
|     uint16_t u16[8];
 | |
|     int32_t s32[4];
 | |
|     uint32_t u32[4];
 | |
|     int64_t s64[2];
 | |
|     uint64_t u64[2];
 | |
|     float f32[4];
 | |
|     double f64[2];
 | |
| };
 | |
| 
 | |
| #define PPC_ROUND_NEAREST 0x00
 | |
| #define PPC_ROUND_TOWARD_ZERO 0x01
 | |
| #define PPC_ROUND_UP 0x02
 | |
| #define PPC_ROUND_DOWN 0x03
 | |
| #define PPC_ROUND_MASK 0x03
 | |
| 
 | |
| struct PPCFPSCRRegister
 | |
| {
 | |
|     uint32_t csr;
 | |
| 
 | |
|     static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO };
 | |
| 
 | |
|     // simde does not handle denormal flags, so we need to implement per-arch.
 | |
| #if defined(__x86_64__) || defined(_M_X64)
 | |
|     static constexpr size_t RoundShift = 13;
 | |
|     static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK;
 | |
|     static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 | |
|     static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN };
 | |
| 
 | |
|     inline uint32_t getcsr() noexcept
 | |
|     {
 | |
|         return simde_mm_getcsr();
 | |
|     }
 | |
| 
 | |
|     inline void setcsr(uint32_t csr) noexcept
 | |
|     {
 | |
|         simde_mm_setcsr(csr);
 | |
|     }
 | |
| #elif defined(__aarch64__) || defined(_M_ARM64)
 | |
|     // RMode
 | |
|     static constexpr size_t RoundShift = 22;
 | |
|     static constexpr size_t RoundMask = 3 << RoundShift;
 | |
|     // FZ and FZ16
 | |
|     static constexpr size_t FlushMask = (1 << 19) | (1 << 24);
 | |
|     // Nearest, Zero, -Infinity, -Infinity
 | |
|     static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift };
 | |
| 
 | |
|     inline uint32_t getcsr() noexcept
 | |
|     {
 | |
|         uint64_t csr;
 | |
|         __asm__ __volatile__("mrs %0, fpcr" : "=r"(csr));
 | |
|         return csr;
 | |
|     }
 | |
| 
 | |
|     inline void setcsr(uint32_t csr) noexcept
 | |
|     {
 | |
|         __asm__ __volatile__("msr fpcr, %0" : : "r"(csr));
 | |
|     }
 | |
| #else
 | |
| #   error "Missing implementation for FPSCR."
 | |
| #endif
 | |
| 
 | |
|     inline uint32_t loadFromHost() noexcept
 | |
|     {
 | |
|         csr = getcsr();
 | |
|         return HostToGuest[(csr & RoundMask) >> RoundShift];
 | |
|     }
 | |
|         
 | |
|     inline void storeFromGuest(uint32_t value) noexcept
 | |
|     {
 | |
|         csr &= ~RoundMask;
 | |
|         csr |= GuestToHost[value & PPC_ROUND_MASK];
 | |
|         setcsr(csr);
 | |
|     }
 | |
| 
 | |
|     inline void enableFlushModeUnconditional() noexcept
 | |
|     {
 | |
|         csr |= FlushMask;
 | |
|         setcsr(csr);
 | |
|     }
 | |
| 
 | |
|     inline void disableFlushModeUnconditional() noexcept
 | |
|     {
 | |
|         csr &= ~FlushMask;
 | |
|         setcsr(csr);
 | |
|     }
 | |
| 
 | |
|     inline void enableFlushMode() noexcept
 | |
|     {
 | |
|         if ((csr & FlushMask) != FlushMask) [[unlikely]]
 | |
|         {
 | |
|             csr |= FlushMask;
 | |
|             setcsr(csr);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     inline void disableFlushMode() noexcept
 | |
|     {
 | |
|         if ((csr & FlushMask) != 0) [[unlikely]]
 | |
|         {
 | |
|             csr &= ~FlushMask;
 | |
|             setcsr(csr);
 | |
|         }
 | |
|     }
 | |
| };
 | |
| 
 | |
| struct alignas(0x40) PPCContext
 | |
| {
 | |
|     PPCRegister r3;
 | |
| #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
 | |
|     PPCRegister r0;
 | |
| #endif
 | |
|     PPCRegister r1;
 | |
| #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
 | |
|     PPCRegister r2;
 | |
| #endif
 | |
|     PPCRegister r4;
 | |
|     PPCRegister r5;
 | |
|     PPCRegister r6;
 | |
|     PPCRegister r7;
 | |
|     PPCRegister r8;
 | |
|     PPCRegister r9;
 | |
|     PPCRegister r10;
 | |
| #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
 | |
|     PPCRegister r11;
 | |
|     PPCRegister r12;
 | |
| #endif
 | |
|     PPCRegister r13;
 | |
| #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
 | |
|     PPCRegister r14;
 | |
|     PPCRegister r15;
 | |
|     PPCRegister r16;
 | |
|     PPCRegister r17;
 | |
|     PPCRegister r18;
 | |
|     PPCRegister r19;
 | |
|     PPCRegister r20;
 | |
|     PPCRegister r21;
 | |
|     PPCRegister r22;
 | |
|     PPCRegister r23;
 | |
|     PPCRegister r24;
 | |
|     PPCRegister r25;
 | |
|     PPCRegister r26;
 | |
|     PPCRegister r27;
 | |
|     PPCRegister r28;
 | |
|     PPCRegister r29;
 | |
|     PPCRegister r30;
 | |
|     PPCRegister r31;
 | |
| #endif
 | |
| 
 | |
| #ifndef PPC_CONFIG_SKIP_LR
 | |
|     uint64_t lr;
 | |
| #endif
 | |
| #ifndef PPC_CONFIG_CTR_AS_LOCAL
 | |
|     PPCRegister ctr;
 | |
| #endif
 | |
| #ifndef PPC_CONFIG_XER_AS_LOCAL
 | |
|     PPCXERRegister xer;
 | |
| #endif
 | |
| #ifndef PPC_CONFIG_RESERVED_AS_LOCAL
 | |
|     PPCRegister reserved;
 | |
| #endif
 | |
| #ifndef PPC_CONFIG_SKIP_MSR
 | |
|     uint32_t msr = 0x200A000;
 | |
| #endif
 | |
| #ifndef PPC_CONFIG_CR_AS_LOCAL
 | |
|     PPCCRRegister cr0;
 | |
|     PPCCRRegister cr1;
 | |
|     PPCCRRegister cr2;
 | |
|     PPCCRRegister cr3;
 | |
|     PPCCRRegister cr4;
 | |
|     PPCCRRegister cr5;
 | |
|     PPCCRRegister cr6;
 | |
|     PPCCRRegister cr7;
 | |
| #endif
 | |
|     PPCFPSCRRegister fpscr;
 | |
| 
 | |
| #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
 | |
|     PPCRegister f0;
 | |
| #endif
 | |
|     PPCRegister f1;
 | |
|     PPCRegister f2;
 | |
|     PPCRegister f3;
 | |
|     PPCRegister f4;
 | |
|     PPCRegister f5;
 | |
|     PPCRegister f6;
 | |
|     PPCRegister f7;
 | |
|     PPCRegister f8;
 | |
|     PPCRegister f9;
 | |
|     PPCRegister f10;
 | |
|     PPCRegister f11;
 | |
|     PPCRegister f12;
 | |
|     PPCRegister f13;
 | |
| #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
 | |
|     PPCRegister f14;
 | |
|     PPCRegister f15;
 | |
|     PPCRegister f16;
 | |
|     PPCRegister f17;
 | |
|     PPCRegister f18;
 | |
|     PPCRegister f19;
 | |
|     PPCRegister f20;
 | |
|     PPCRegister f21;
 | |
|     PPCRegister f22;
 | |
|     PPCRegister f23;
 | |
|     PPCRegister f24;
 | |
|     PPCRegister f25;
 | |
|     PPCRegister f26;
 | |
|     PPCRegister f27;
 | |
|     PPCRegister f28;
 | |
|     PPCRegister f29;
 | |
|     PPCRegister f30;
 | |
|     PPCRegister f31;
 | |
| #endif
 | |
| 
 | |
|     PPCVRegister v0;
 | |
|     PPCVRegister v1;
 | |
|     PPCVRegister v2;
 | |
|     PPCVRegister v3;
 | |
|     PPCVRegister v4;
 | |
|     PPCVRegister v5;
 | |
|     PPCVRegister v6;
 | |
|     PPCVRegister v7;
 | |
|     PPCVRegister v8;
 | |
|     PPCVRegister v9;
 | |
|     PPCVRegister v10;
 | |
|     PPCVRegister v11;
 | |
|     PPCVRegister v12;
 | |
|     PPCVRegister v13;
 | |
| #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
 | |
|     PPCVRegister v14;
 | |
|     PPCVRegister v15;
 | |
|     PPCVRegister v16;
 | |
|     PPCVRegister v17;
 | |
|     PPCVRegister v18;
 | |
|     PPCVRegister v19;
 | |
|     PPCVRegister v20;
 | |
|     PPCVRegister v21;
 | |
|     PPCVRegister v22;
 | |
|     PPCVRegister v23;
 | |
|     PPCVRegister v24;
 | |
|     PPCVRegister v25;
 | |
|     PPCVRegister v26;
 | |
|     PPCVRegister v27;
 | |
|     PPCVRegister v28;
 | |
|     PPCVRegister v29;
 | |
|     PPCVRegister v30;
 | |
|     PPCVRegister v31;
 | |
| #endif
 | |
| #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
 | |
|     PPCVRegister v32;
 | |
|     PPCVRegister v33;
 | |
|     PPCVRegister v34;
 | |
|     PPCVRegister v35;
 | |
|     PPCVRegister v36;
 | |
|     PPCVRegister v37;
 | |
|     PPCVRegister v38;
 | |
|     PPCVRegister v39;
 | |
|     PPCVRegister v40;
 | |
|     PPCVRegister v41;
 | |
|     PPCVRegister v42;
 | |
|     PPCVRegister v43;
 | |
|     PPCVRegister v44;
 | |
|     PPCVRegister v45;
 | |
|     PPCVRegister v46;
 | |
|     PPCVRegister v47;
 | |
|     PPCVRegister v48;
 | |
|     PPCVRegister v49;
 | |
|     PPCVRegister v50;
 | |
|     PPCVRegister v51;
 | |
|     PPCVRegister v52;
 | |
|     PPCVRegister v53;
 | |
|     PPCVRegister v54;
 | |
|     PPCVRegister v55;
 | |
|     PPCVRegister v56;
 | |
|     PPCVRegister v57;
 | |
|     PPCVRegister v58;
 | |
|     PPCVRegister v59;
 | |
|     PPCVRegister v60;
 | |
|     PPCVRegister v61;
 | |
|     PPCVRegister v62;
 | |
|     PPCVRegister v63;
 | |
| #endif
 | |
| #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
 | |
|     PPCVRegister v64;
 | |
|     PPCVRegister v65;
 | |
|     PPCVRegister v66;
 | |
|     PPCVRegister v67;
 | |
|     PPCVRegister v68;
 | |
|     PPCVRegister v69;
 | |
|     PPCVRegister v70;
 | |
|     PPCVRegister v71;
 | |
|     PPCVRegister v72;
 | |
|     PPCVRegister v73;
 | |
|     PPCVRegister v74;
 | |
|     PPCVRegister v75;
 | |
|     PPCVRegister v76;
 | |
|     PPCVRegister v77;
 | |
|     PPCVRegister v78;
 | |
|     PPCVRegister v79;
 | |
|     PPCVRegister v80;
 | |
|     PPCVRegister v81;
 | |
|     PPCVRegister v82;
 | |
|     PPCVRegister v83;
 | |
|     PPCVRegister v84;
 | |
|     PPCVRegister v85;
 | |
|     PPCVRegister v86;
 | |
|     PPCVRegister v87;
 | |
|     PPCVRegister v88;
 | |
|     PPCVRegister v89;
 | |
|     PPCVRegister v90;
 | |
|     PPCVRegister v91;
 | |
|     PPCVRegister v92;
 | |
|     PPCVRegister v93;
 | |
|     PPCVRegister v94;
 | |
|     PPCVRegister v95;
 | |
|     PPCVRegister v96;
 | |
|     PPCVRegister v97;
 | |
|     PPCVRegister v98;
 | |
|     PPCVRegister v99;
 | |
|     PPCVRegister v100;
 | |
|     PPCVRegister v101;
 | |
|     PPCVRegister v102;
 | |
|     PPCVRegister v103;
 | |
|     PPCVRegister v104;
 | |
|     PPCVRegister v105;
 | |
|     PPCVRegister v106;
 | |
|     PPCVRegister v107;
 | |
|     PPCVRegister v108;
 | |
|     PPCVRegister v109;
 | |
|     PPCVRegister v110;
 | |
|     PPCVRegister v111;
 | |
|     PPCVRegister v112;
 | |
|     PPCVRegister v113;
 | |
|     PPCVRegister v114;
 | |
|     PPCVRegister v115;
 | |
|     PPCVRegister v116;
 | |
|     PPCVRegister v117;
 | |
|     PPCVRegister v118;
 | |
|     PPCVRegister v119;
 | |
|     PPCVRegister v120;
 | |
|     PPCVRegister v121;
 | |
|     PPCVRegister v122;
 | |
|     PPCVRegister v123;
 | |
|     PPCVRegister v124;
 | |
|     PPCVRegister v125;
 | |
|     PPCVRegister v126;
 | |
|     PPCVRegister v127;
 | |
| #endif
 | |
| };
 | |
| 
 | |
| inline uint8_t VectorMaskL[] =
 | |
| {
 | |
|     0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
 | |
|     0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
 | |
|     0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
 | |
|     0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E,
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F,
 | |
| };
 | |
| 
 | |
| inline uint8_t VectorMaskR[] =
 | |
| {
 | |
|     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
 | |
|     0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF,
 | |
|     0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF,
 | |
|     0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF,
 | |
| };
 | |
| 
 | |
| inline uint8_t VectorShiftTableL[] =
 | |
| {
 | |
|     0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
 | |
|     0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
 | |
|     0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
 | |
|     0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
 | |
|     0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
 | |
|     0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
 | |
|     0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
 | |
|     0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
 | |
|     0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
 | |
|     0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
 | |
|     0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
 | |
|     0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
 | |
|     0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C,
 | |
|     0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D,
 | |
|     0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E,
 | |
|     0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F,
 | |
| };
 | |
| 
 | |
| inline uint8_t VectorShiftTableR[] =
 | |
| {
 | |
|     0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
 | |
|     0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F,
 | |
|     0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E,
 | |
|     0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D,
 | |
|     0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C,
 | |
|     0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
 | |
|     0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
 | |
|     0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
 | |
|     0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
 | |
|     0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
 | |
|     0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
 | |
|     0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
 | |
|     0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
 | |
|     0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
 | |
|     0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
 | |
|     0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
 | |
| };
 | |
| 
 | |
| inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b)
 | |
| {
 | |
|     return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b));
 | |
| }
 | |
| 
 | |
| inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b)
 | |
| {
 | |
|     simde__m128i c = simde_mm_set1_epi8(char(128));
 | |
|     return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
 | |
| }
 | |
| 
 | |
| inline simde__m128i simde_mm_avg_epi16(simde__m128i a, simde__m128i b)
 | |
| {
 | |
|     simde__m128i c = simde_mm_set1_epi16(short(32768));
 | |
|     return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
 | |
| }
 | |
| 
 | |
| inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1)
 | |
| {
 | |
|     simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127));
 | |
|     simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8);
 | |
|     xmm0 = simde_mm_srli_epi32(xmm0, 31);
 | |
|     xmm0 = simde_mm_add_epi32(xmm0, xmm1);
 | |
|     xmm0 = simde_mm_srai_epi32(xmm0, 8);
 | |
|     xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000));
 | |
|     simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1);
 | |
|     return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1));
 | |
| }
 | |
| 
 | |
| inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c)
 | |
| {
 | |
|     simde__m128i d = simde_mm_set1_epi8(0xF);
 | |
|     simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d));
 | |
|     return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3));
 | |
| }
 | |
| 
 | |
| inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b)
 | |
| {
 | |
|     simde__m128i c = simde_mm_set1_epi8(char(128));
 | |
|     return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
 | |
| }
 | |
| 
 | |
| inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b)
 | |
| {
 | |
|     simde__m128i c = simde_mm_set1_epi16(short(32768));
 | |
|     return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
 | |
| }
 | |
| 
 | |
| inline simde__m128i simde_mm_vctsxs(simde__m128 src1)
 | |
| {
 | |
|     simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1);
 | |
|     simde__m128i xmm0 = simde_mm_cvttps_epi32(src1);
 | |
|     simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN));
 | |
|     xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1);
 | |
|     simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1));
 | |
|     return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest));
 | |
| }
 | |
| 
 | |
| inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b)
 | |
| {
 | |
|     b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61);
 | |
|     return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10));
 | |
| }
 | |
| 
 | |
| #if defined(__aarch64__) || defined(_M_ARM64)
 | |
| inline uint64_t __rdtsc()
 | |
| {
 | |
|     uint64_t ret;
 | |
|     asm volatile("mrs %0, cntvct_el0\n\t"
 | |
|                  : "=r"(ret)::"memory");
 | |
|     return ret;
 | |
| }
 | |
| #elif !defined(__x86_64__) && !defined(_M_X64)
 | |
| #   error "Missing implementation for __rdtsc()"
 | |
| #endif
 | |
| 
 | |
| #endif
 |