// Mirror of https://github.com/hedge-dev/XenonRecomp.git (synced 2025-10-30 07:11:38 +00:00).
// 541 lines, 18 KiB, C++.
| #pragma once
 | |
| #include <cstdint>
 | |
| #include <cstdlib>
 | |
| #include <cstring>
 | |
| #include <cmath>
 | |
| 
 | |
| #ifdef __clang__
 | |
| #include <x86intrin.h>
 | |
| #define __restrict __restrict__
 | |
| #define _byteswap_ushort __builtin_bswap16
 | |
| #define _byteswap_ulong __builtin_bswap32
 | |
| #define _byteswap_uint64 __builtin_bswap64
 | |
| #define isnan __builtin_isnan
 | |
| #define __assume __builtin_assume
 | |
| #define __unreachable() __builtin_unreachable()
 | |
| #else
 | |
| #include <intrin.h>
 | |
| #define __unreachable() __assume(0)
 | |
| #endif
 | |
| 
 | |
// Declares (or starts the definition of) a guest function `x` with C linkage.
// The function receives the register context as `ctx` and a byte pointer
// `base`; the load/store macros below add guest offsets to that `base`.
#define PPC_FUNC(x) extern "C" void x(PPCContext& __restrict ctx, uint8_t* base) noexcept

// Guest memory loads. `x` is an offset added to a `uint8_t* base` that must be
// in scope at the expansion site. Multi-byte values are byte-swapped after
// being read. Every expansion is wrapped in parentheses so that it behaves as a
// single operand inside a larger expression.
#define PPC_LOAD_U8(x) (*(uint8_t*)(base + (x)))
#define PPC_LOAD_U16(x) (_byteswap_ushort(*(uint16_t*)(base + (x))))
#define PPC_LOAD_U32(x) (_byteswap_ulong(*(uint32_t*)(base + (x))))
#define PPC_LOAD_U64(x) (_byteswap_uint64(*(uint64_t*)(base + (x))))

// Guest memory stores. `y` is byte-swapped (for multi-byte widths) and written
// at `base + x`. The whole assignment is parenthesized so that operators
// following the macro invocation cannot be absorbed into the stored value.
#define PPC_STORE_U8(x, y) (*(uint8_t*)(base + (x)) = (y))
#define PPC_STORE_U16(x, y) (*(uint16_t*)(base + (x)) = _byteswap_ushort(y))
#define PPC_STORE_U32(x, y) (*(uint32_t*)(base + (x)) = _byteswap_ulong(y))
#define PPC_STORE_U64(x, y) (*(uint64_t*)(base + (x)) = _byteswap_uint64(y))
 | |
| 
 | |
struct PPCContext;

// Function type of a host routine taking the register context and the `base`
// byte pointer (same parameter list as PPC_FUNC, without the noexcept).
using PPCFunc = void(PPCContext& __restrict ctx, uint8_t* base);

// One entry pairing a `guest` value with the `host` function pointer for it.
// NOTE(review): `guest` is presumably the guest address of the function -
// confirm against the code that fills the table.
struct PPCFuncMapping
{
    size_t   guest;
    PPCFunc* host;
};

// Table of mappings, defined elsewhere with C linkage. Its length is not
// visible from this declaration.
extern "C" PPCFuncMapping PPCFuncMappings[];
 | |
| 
 | |
// A single 64-bit register slot. All members live in one anonymous union, so
// they are alternative views of the same storage.
struct PPCRegister
{
    union
    {
        // 8-bit views
        int8_t   s8;
        uint8_t  u8;

        // 16-bit views
        int16_t  s16;
        uint16_t u16;

        // 32-bit views
        int32_t  s32;
        uint32_t u32;

        // 64-bit views
        int64_t  s64;
        uint64_t u64;

        // floating-point views
        float    f32;
        double   f64;
    };
};
 | |
| 
 | |
// XER state kept as three separate byte-sized flags.
// NOTE(review): presumably summary-overflow / overflow / carry - only `so` is
// consumed in this header (by PPCCRRegister::compare).
struct PPCXERRegister
{
    uint8_t so; // copied into a CR field's `so` by integer compares
    uint8_t ov;
    uint8_t ca;
};
 | |
| 
 | |
| struct PPCCRRegister
 | |
| {
 | |
|     uint8_t lt;
 | |
|     uint8_t gt;
 | |
|     uint8_t eq;
 | |
|     union
 | |
|     {
 | |
|         uint8_t so;
 | |
|         uint8_t un;
 | |
|     };
 | |
| 
 | |
|     template<typename T>
 | |
|     void compare(T left, T right, const PPCXERRegister& xer)
 | |
|     {
 | |
|         lt = left < right;
 | |
|         gt = left > right;
 | |
|         eq = left == right;
 | |
|         so = xer.so;
 | |
|     }
 | |
| 
 | |
|     void compare(double left, double right)
 | |
|     {
 | |
|         lt = left < right;
 | |
|         gt = left > right;
 | |
|         eq = left == right;
 | |
|         un = isnan(left) || isnan(right);
 | |
|     }
 | |
| 
 | |
|     void setFromMask(__m128 mask, int imm)
 | |
|     {
 | |
|         int m = _mm_movemask_ps(mask);
 | |
|         lt = m == imm; // all equal
 | |
|         gt = 0;
 | |
|         eq = m == 0; // none equal
 | |
|         so = 0;
 | |
|     }
 | |
| 
 | |
|     void setFromMask(__m128i mask, int imm)
 | |
|     {
 | |
|         int m = _mm_movemask_epi8(mask);
 | |
|         lt = m == imm; // all equal
 | |
|         gt = 0;
 | |
|         eq = m == 0; // none equal
 | |
|         so = 0;
 | |
|     }
 | |
| };
 | |
| 
 | |
// A 128-bit vector register slot, aligned to 16 bytes. The anonymous union
// exposes the same 16 bytes as arrays of every supported element type.
struct alignas(0x10) PPCVRegister
{
    union
    {
        // sixteen 8-bit elements
        int8_t   s8[16];
        uint8_t  u8[16];

        // eight 16-bit elements
        int16_t  s16[8];
        uint16_t u16[8];

        // four 32-bit elements
        int32_t  s32[4];
        uint32_t u32[4];

        // two 64-bit elements
        int64_t  s64[2];
        uint64_t u64[2];

        // floating-point elements
        float    f32[4];
        double   f64[2];
    };
};
 | |
| 
 | |
// Cached copy of the host MXCSR control/status register.
struct CSRRegister
{
    uint32_t value;

    // Refreshes the cached copy from the current MXCSR.
    void storeValue()
    {
        value = _mm_getcsr();
    }

    // Turns the flush-to-zero and denormals-are-zero bits on or off together.
    // The decision is made against the cached `value`; MXCSR is written only
    // when the cached value would actually change, and the cache is updated to
    // match.
    void setFlushMode(bool enable)
    {
        const uint32_t flushBits = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
        const uint32_t desired = enable ? (value | flushBits) : (value & ~flushBits);

        if (desired == value)
            return;

        _mm_setcsr(desired);
        value = desired;
    }
};
 | |
| 
 | |
| struct PPCContext
 | |
| {
 | |
|     PPCFunc** fn;
 | |
|     uint64_t lr;
 | |
|     PPCRegister ctr;
 | |
|     PPCXERRegister xer;
 | |
|     PPCRegister reserved;
 | |
|     uint32_t msr;
 | |
|     uint32_t fpscr;
 | |
|     CSRRegister csr;
 | |
| 
 | |
|     union
 | |
|     {
 | |
|         struct
 | |
|         {
 | |
|             PPCCRRegister cr0;
 | |
|             PPCCRRegister cr1;
 | |
|             PPCCRRegister cr2;
 | |
|             PPCCRRegister cr3;
 | |
|             PPCCRRegister cr4;
 | |
|             PPCCRRegister cr5;
 | |
|             PPCCRRegister cr6;
 | |
|             PPCCRRegister cr7;
 | |
|         };
 | |
|         PPCCRRegister cr[8];
 | |
|     };
 | |
| 
 | |
|     union
 | |
|     {
 | |
|         struct
 | |
|         {
 | |
|             PPCRegister r0;
 | |
|             PPCRegister r1;
 | |
|             PPCRegister r2;
 | |
|             PPCRegister r3;
 | |
|             PPCRegister r4;
 | |
|             PPCRegister r5;
 | |
|             PPCRegister r6;
 | |
|             PPCRegister r7;
 | |
|             PPCRegister r8;
 | |
|             PPCRegister r9;
 | |
|             PPCRegister r10;
 | |
|             PPCRegister r11;
 | |
|             PPCRegister r12;
 | |
|             PPCRegister r13;
 | |
|             PPCRegister r14;
 | |
|             PPCRegister r15;
 | |
|             PPCRegister r16;
 | |
|             PPCRegister r17;
 | |
|             PPCRegister r18;
 | |
|             PPCRegister r19;
 | |
|             PPCRegister r20;
 | |
|             PPCRegister r21;
 | |
|             PPCRegister r22;
 | |
|             PPCRegister r23;
 | |
|             PPCRegister r24;
 | |
|             PPCRegister r25;
 | |
|             PPCRegister r26;
 | |
|             PPCRegister r27;
 | |
|             PPCRegister r28;
 | |
|             PPCRegister r29;
 | |
|             PPCRegister r30;
 | |
|             PPCRegister r31;
 | |
|         };
 | |
|         PPCRegister r[32];
 | |
|     };
 | |
| 
 | |
|     union
 | |
|     {
 | |
|         struct
 | |
|         {
 | |
|             PPCRegister f0;
 | |
|             PPCRegister f1;
 | |
|             PPCRegister f2;
 | |
|             PPCRegister f3;
 | |
|             PPCRegister f4;
 | |
|             PPCRegister f5;
 | |
|             PPCRegister f6;
 | |
|             PPCRegister f7;
 | |
|             PPCRegister f8;
 | |
|             PPCRegister f9;
 | |
|             PPCRegister f10;
 | |
|             PPCRegister f11;
 | |
|             PPCRegister f12;
 | |
|             PPCRegister f13;
 | |
|             PPCRegister f14;
 | |
|             PPCRegister f15;
 | |
|             PPCRegister f16;
 | |
|             PPCRegister f17;
 | |
|             PPCRegister f18;
 | |
|             PPCRegister f19;
 | |
|             PPCRegister f20;
 | |
|             PPCRegister f21;
 | |
|             PPCRegister f22;
 | |
|             PPCRegister f23;
 | |
|             PPCRegister f24;
 | |
|             PPCRegister f25;
 | |
|             PPCRegister f26;
 | |
|             PPCRegister f27;
 | |
|             PPCRegister f28;
 | |
|             PPCRegister f29;
 | |
|             PPCRegister f30;
 | |
|             PPCRegister f31;
 | |
|         };
 | |
|         PPCRegister f[32];
 | |
|     };
 | |
| 
 | |
|     union
 | |
|     {
 | |
|         struct
 | |
|         {
 | |
|             PPCVRegister v0;
 | |
|             PPCVRegister v1;
 | |
|             PPCVRegister v2;
 | |
|             PPCVRegister v3;
 | |
|             PPCVRegister v4;
 | |
|             PPCVRegister v5;
 | |
|             PPCVRegister v6;
 | |
|             PPCVRegister v7;
 | |
|             PPCVRegister v8;
 | |
|             PPCVRegister v9;
 | |
|             PPCVRegister v10;
 | |
|             PPCVRegister v11;
 | |
|             PPCVRegister v12;
 | |
|             PPCVRegister v13;
 | |
|             PPCVRegister v14;
 | |
|             PPCVRegister v15;
 | |
|             PPCVRegister v16;
 | |
|             PPCVRegister v17;
 | |
|             PPCVRegister v18;
 | |
|             PPCVRegister v19;
 | |
|             PPCVRegister v20;
 | |
|             PPCVRegister v21;
 | |
|             PPCVRegister v22;
 | |
|             PPCVRegister v23;
 | |
|             PPCVRegister v24;
 | |
|             PPCVRegister v25;
 | |
|             PPCVRegister v26;
 | |
|             PPCVRegister v27;
 | |
|             PPCVRegister v28;
 | |
|             PPCVRegister v29;
 | |
|             PPCVRegister v30;
 | |
|             PPCVRegister v31;
 | |
|             PPCVRegister v32;
 | |
|             PPCVRegister v33;
 | |
|             PPCVRegister v34;
 | |
|             PPCVRegister v35;
 | |
|             PPCVRegister v36;
 | |
|             PPCVRegister v37;
 | |
|             PPCVRegister v38;
 | |
|             PPCVRegister v39;
 | |
|             PPCVRegister v40;
 | |
|             PPCVRegister v41;
 | |
|             PPCVRegister v42;
 | |
|             PPCVRegister v43;
 | |
|             PPCVRegister v44;
 | |
|             PPCVRegister v45;
 | |
|             PPCVRegister v46;
 | |
|             PPCVRegister v47;
 | |
|             PPCVRegister v48;
 | |
|             PPCVRegister v49;
 | |
|             PPCVRegister v50;
 | |
|             PPCVRegister v51;
 | |
|             PPCVRegister v52;
 | |
|             PPCVRegister v53;
 | |
|             PPCVRegister v54;
 | |
|             PPCVRegister v55;
 | |
|             PPCVRegister v56;
 | |
|             PPCVRegister v57;
 | |
|             PPCVRegister v58;
 | |
|             PPCVRegister v59;
 | |
|             PPCVRegister v60;
 | |
|             PPCVRegister v61;
 | |
|             PPCVRegister v62;
 | |
|             PPCVRegister v63;
 | |
|             PPCVRegister v64;
 | |
|             PPCVRegister v65;
 | |
|             PPCVRegister v66;
 | |
|             PPCVRegister v67;
 | |
|             PPCVRegister v68;
 | |
|             PPCVRegister v69;
 | |
|             PPCVRegister v70;
 | |
|             PPCVRegister v71;
 | |
|             PPCVRegister v72;
 | |
|             PPCVRegister v73;
 | |
|             PPCVRegister v74;
 | |
|             PPCVRegister v75;
 | |
|             PPCVRegister v76;
 | |
|             PPCVRegister v77;
 | |
|             PPCVRegister v78;
 | |
|             PPCVRegister v79;
 | |
|             PPCVRegister v80;
 | |
|             PPCVRegister v81;
 | |
|             PPCVRegister v82;
 | |
|             PPCVRegister v83;
 | |
|             PPCVRegister v84;
 | |
|             PPCVRegister v85;
 | |
|             PPCVRegister v86;
 | |
|             PPCVRegister v87;
 | |
|             PPCVRegister v88;
 | |
|             PPCVRegister v89;
 | |
|             PPCVRegister v90;
 | |
|             PPCVRegister v91;
 | |
|             PPCVRegister v92;
 | |
|             PPCVRegister v93;
 | |
|             PPCVRegister v94;
 | |
|             PPCVRegister v95;
 | |
|             PPCVRegister v96;
 | |
|             PPCVRegister v97;
 | |
|             PPCVRegister v98;
 | |
|             PPCVRegister v99;
 | |
|             PPCVRegister v100;
 | |
|             PPCVRegister v101;
 | |
|             PPCVRegister v102;
 | |
|             PPCVRegister v103;
 | |
|             PPCVRegister v104;
 | |
|             PPCVRegister v105;
 | |
|             PPCVRegister v106;
 | |
|             PPCVRegister v107;
 | |
|             PPCVRegister v108;
 | |
|             PPCVRegister v109;
 | |
|             PPCVRegister v110;
 | |
|             PPCVRegister v111;
 | |
|             PPCVRegister v112;
 | |
|             PPCVRegister v113;
 | |
|             PPCVRegister v114;
 | |
|             PPCVRegister v115;
 | |
|             PPCVRegister v116;
 | |
|             PPCVRegister v117;
 | |
|             PPCVRegister v118;
 | |
|             PPCVRegister v119;
 | |
|             PPCVRegister v120;
 | |
|             PPCVRegister v121;
 | |
|             PPCVRegister v122;
 | |
|             PPCVRegister v123;
 | |
|             PPCVRegister v124;
 | |
|             PPCVRegister v125;
 | |
|             PPCVRegister v126;
 | |
|             PPCVRegister v127;
 | |
|         };
 | |
|         PPCVRegister v[128];
 | |
|     };
 | |
| };
 | |
| 
 | |
// 256-byte table read as 16 rows of 16 bytes. Row n starts with n bytes of
// 0xFF, followed by the descending sequence 0x0F, 0x0E, ... down to n.
// NOTE(review): presumably a byte-shuffle control table indexed by a 0-15
// misalignment - confirm against the code that reads it.
inline uint8_t VectorMaskL[] =
{
    /* row  0 */ 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
    /* row  1 */ 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
    /* row  2 */ 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
    /* row  3 */ 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
    /* row  4 */ 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
    /* row  5 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
    /* row  6 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
    /* row  7 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
    /* row  8 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
    /* row  9 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
    /* row 10 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
    /* row 11 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
    /* row 12 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C,
    /* row 13 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D,
    /* row 14 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E,
    /* row 15 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F,
};
 | |
| 
 | |
// 256-byte table read as 16 rows of 16 bytes. Row 0 is all 0xFF; row n (n > 0)
// starts with the descending sequence n-1, n-2, ... 0x00 and is padded to 16
// bytes with 0xFF.
// NOTE(review): presumably the right-hand counterpart of VectorMaskL, indexed
// by a 0-15 misalignment - confirm against the code that reads it.
inline uint8_t VectorMaskR[] =
{
    /* row  0 */ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  1 */ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  2 */ 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  3 */ 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  4 */ 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  5 */ 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  6 */ 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  7 */ 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  8 */ 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row  9 */ 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row 10 */ 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row 11 */ 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row 12 */ 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
    /* row 13 */ 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF,
    /* row 14 */ 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF,
    /* row 15 */ 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF,
};
 | |
| 
 | |
// 256-byte table read as 16 rows of 16 bytes. Row n is the descending run that
// starts at 0x0F + n and ends at n (row 0: 0x0F..0x00, row 15: 0x1E..0x0F).
// NOTE(review): presumably permute-control vectors for a left shift by n
// bytes - confirm against the code that reads it.
inline uint8_t VectorShiftTableL[] =
{
    /* row  0 */ 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
    /* row  1 */ 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
    /* row  2 */ 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
    /* row  3 */ 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
    /* row  4 */ 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
    /* row  5 */ 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
    /* row  6 */ 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
    /* row  7 */ 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
    /* row  8 */ 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
    /* row  9 */ 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
    /* row 10 */ 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
    /* row 11 */ 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
    /* row 12 */ 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C,
    /* row 13 */ 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D,
    /* row 14 */ 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E,
    /* row 15 */ 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F,
};
 | |
| 
 | |
// 256-byte table read as 16 rows of 16 bytes. Row n is the descending run that
// starts at 0x1F - n and ends at 0x10 - n (row 0: 0x1F..0x10, row 15:
// 0x10..0x01).
// NOTE(review): presumably permute-control vectors for a right shift by n
// bytes - confirm against the code that reads it.
inline uint8_t VectorShiftTableR[] =
{
    /* row  0 */ 0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
    /* row  1 */ 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F,
    /* row  2 */ 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E,
    /* row  3 */ 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D,
    /* row  4 */ 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C,
    /* row  5 */ 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
    /* row  6 */ 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
    /* row  7 */ 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
    /* row  8 */ 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
    /* row  9 */ 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
    /* row 10 */ 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
    /* row 11 */ 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
    /* row 12 */ 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
    /* row 13 */ 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
    /* row 14 */ 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
    /* row 15 */ 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
};
 | |
| 
 | |
| inline __m128i _mm_adds_epu32(__m128i a, __m128i b) 
 | |
| {
 | |
|     return _mm_add_epi32(_mm_min_epu32(a, _mm_xor_si128(b, _mm_cmpeq_epi32(b, b))), b);
 | |
| }
 | |
| 
 | |
// Rounding average of sixteen signed 8-bit lanes. Flipping the sign bit maps
// the signed range onto the unsigned one, so the unsigned average can be used;
// the flip is undone on the result.
inline __m128i _mm_avg_epi8(__m128i a, __m128i b)
{
    const __m128i signBit = _mm_set1_epi8(char(128));
    const __m128i biasedA = _mm_xor_si128(signBit, a);
    const __m128i biasedB = _mm_xor_si128(signBit, b);
    return _mm_xor_si128(signBit, _mm_avg_epu8(biasedA, biasedB));
}
 | |
| 
 | |
// Rounding average of eight signed 16-bit lanes, built on the unsigned average
// by flipping the sign bit of both inputs and of the result.
inline __m128i _mm_avg_epi16(__m128i a, __m128i b)
{
    const __m128i signBit = _mm_set1_epi16(short(32768));
    const __m128i biasedA = _mm_xor_si128(signBit, a);
    const __m128i biasedB = _mm_xor_si128(signBit, b);
    return _mm_xor_si128(signBit, _mm_avg_epu16(biasedA, biasedB));
}
 | |
| 
 | |
// Converts four unsigned 32-bit lanes to float.
//
// Only a signed int->float conversion is used, so each lane is split into its
// upper and lower 16 bits. Both halves are small non-negative integers that
// convert exactly, and scaling the upper half by 65536 is exact as well, so
// the final addition is the only step that rounds. This covers the full
// unsigned range: the earlier `v - (v >> 1)` split produced 0x80000000 for
// v == 0xFFFFFFFF, which the signed conversion read as -2^31 and the result
// collapsed to 0.
inline __m128 _mm_cvtepu32_ps_(__m128i v)
{
    const __m128i hi = _mm_srli_epi32(v, 16);
    const __m128i lo = _mm_and_si128(v, _mm_set1_epi32(0xFFFF));
    const __m128 hiScaled = _mm_mul_ps(_mm_cvtepi32_ps(hi), _mm_set1_ps(65536.0f));
    return _mm_add_ps(hiScaled, _mm_cvtepi32_ps(lo));
}
 | |
| 
 | |
| inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c)
 | |
| {
 | |
|     __m128i d = _mm_set1_epi8(0xF);
 | |
|     __m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d));
 | |
|     return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3));
 | |
| }
 | |
| 
 | |
// Unsigned a > b on sixteen 8-bit lanes. Flipping the sign bit of both
// operands preserves their unsigned order under the signed comparison.
inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b)
{
    const __m128i signBit = _mm_set1_epi8(char(128));
    const __m128i biasedA = _mm_xor_si128(a, signBit);
    const __m128i biasedB = _mm_xor_si128(b, signBit);
    return _mm_cmpgt_epi8(biasedA, biasedB);
}
 | |
| 
 | |
// Unsigned a > b on eight 16-bit lanes, using the signed comparison after
// flipping the sign bit of both operands.
inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b)
{
    const __m128i signBit = _mm_set1_epi16(short(32768));
    const __m128i biasedA = _mm_xor_si128(a, signBit);
    const __m128i biasedB = _mm_xor_si128(b, signBit);
    return _mm_cmpgt_epi16(biasedA, biasedB);
}
 | |
| 
 | |
// Converts four floats to signed 32-bit integers with truncation and
// saturation; NaN lanes produce 0.
//
// The truncating conversion returns 0x80000000 for every lane it cannot
// represent. That already equals INT32_MIN, so negative overflow needs no
// fix-up. Lanes at or above 2^31 are detected separately and their 0x80000000
// is flipped to 0x7FFFFFFF. The comparison must be >= rather than >: 2^31
// itself is exactly representable as a float and is already out of range.
inline __m128i _mm_vctsxs(__m128 a)
{
    __m128i result = _mm_cvttps_epi32(a);

    // All-ones in every lane whose value is >= 2^31 (false for NaN).
    const __m128 tooLarge = _mm_cmpge_ps(a, _mm_set1_ps(2147483648.0f));
    result = _mm_xor_si128(result, _mm_castps_si128(tooLarge));

    // Zero out NaN lanes (a lane is ordered with itself unless it is NaN).
    const __m128 notNaN = _mm_cmpord_ps(a, a);
    result = _mm_and_si128(result, _mm_castps_si128(notNaN));

    return result;
}
 | |
| 
 | |
| inline __m128i _mm_vsr(__m128i a, __m128i b)
 | |
| {
 | |
|     b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61);
 | |
|     return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10));
 | |
| }
 | 
