From cdfa0907fdd95833d1d55c23dac480226c4ba600 Mon Sep 17 00:00:00 2001 From: DeaTh-G <55578911+DeaTh-G@users.noreply.github.com> Date: Sun, 13 Oct 2024 18:12:23 +0200 Subject: [PATCH 01/18] Add more instructions regarding Bakugan Battle Brawlers --- PowerRecomp/recompiler.cpp | 219 ++++++++++++++++++++++++++++++++++++- 1 file changed, 218 insertions(+), 1 deletion(-) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index e330293..94379f2 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -703,6 +703,10 @@ bool Recompiler::Recompile( // no op break; + case PPC_INST_DCBST: + // no op + break; + case PPC_INST_DCBTST: // no op break; @@ -930,6 +934,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LBZUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U8({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LD: print("\t{}.u64 = PPC_LOAD_U64(", r(insn.operands[0])); if (insn.operands[2] != 0) @@ -958,6 +968,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LDUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U64({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LFD: printSetFlushMode(false); print("\t{}.u64 = PPC_LOAD_U64(", f(insn.operands[0])); @@ -966,6 +982,13 @@ bool Recompiler::Recompile( println("{});", int32_t(insn.operands[1])); break; + case PPC_INST_LFDU: + printSetFlushMode(false); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U64({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_LFDX: printSetFlushMode(false); print("\t{}.u64 = PPC_LOAD_U64(", f(insn.operands[0])); @@ -974,6 +997,13 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LFDUX: + printSetFlushMode(false); + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U64({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LFS: printSetFlushMode(false); print("\t{}.u32 = PPC_LOAD_U32(", temp()); @@ -983,6 +1013,14 @@ bool Recompiler::Recompile( println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); break; + case PPC_INST_LFSU: + printSetFlushMode(false); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u32 = PPC_LOAD_U32({});", temp(), ea()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); + break; + case PPC_INST_LFSX: printSetFlushMode(false); print("\t{}.u32 = PPC_LOAD_U32(", temp()); @@ -992,6 +1030,14 @@ bool Recompiler::Recompile( println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); break; + case PPC_INST_LFSUX: + printSetFlushMode(false); + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u32 = PPC_LOAD_U32({});", temp(), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); + break; + case PPC_INST_LHA: print("\t{}.s64 = int16_t(PPC_LOAD_U16(", r(insn.operands[0])); if (insn.operands[2] != 0) @@ -999,6 +1045,12 @@ bool Recompiler::Recompile( println("{}));", int32_t(insn.operands[1])); break; + case PPC_INST_LHAU: + print("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + print("\t{}.s64 = int16_t(PPC_LOAD_U16({}));", r(insn.operands[0]), ea()); + print("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_LHAX: print("\t{}.s64 = int16_t(PPC_LOAD_U16(", r(insn.operands[0])); if (insn.operands[1] != 0) @@ -1013,6 +1065,12 @@ bool Recompiler::Recompile( println("{});", int32_t(insn.operands[1])); break; + case PPC_INST_LHZU: + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U16({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_LHZX: print("\t{}.u64 = PPC_LOAD_U16(", r(insn.operands[0])); if (insn.operands[1] != 0) @@ -1020,6 +1078,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LHZUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U16({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LI: println("\t{}.s64 = {};", r(insn.operands[0]), int32_t(insn.operands[1])); break; @@ -1032,6 +1096,7 @@ bool Recompiler::Recompile( case PPC_INST_LVEWX128: case PPC_INST_LVX: case PPC_INST_LVX128: + case PPC_INST_LVEHX: // NOTE: for endian swapping, we reverse the whole vector instead of individual elements. // this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz) print("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ((", v(insn.operands[0])); @@ -1127,6 +1192,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LWZUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U32({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_MFCR: for (size_t i = 0; i < 32; i++) { @@ -1388,6 +1459,12 @@ bool Recompiler::Recompile( println("{}.u32, {}.u8);", r(insn.operands[2]), r(insn.operands[0])); break; + case PPC_INST_STBUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\tPPC_STORE_U8({}, {}.u8);", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STD: print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64("); if (insn.operands[2] != 0) @@ -1418,6 +1495,12 @@ bool Recompiler::Recompile( println("{}.u32, {}.u64);", r(insn.operands[2]), r(insn.operands[0])); break; + case PPC_INST_STDUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\tPPC_STORE_U64({}, {}.u64);", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STFD: printSetFlushMode(false); print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64("); @@ -1426,6 +1509,13 @@ bool Recompiler::Recompile( println("{}, {}.u64);", int32_t(insn.operands[1]), f(insn.operands[0])); break; + case PPC_INST_STFDU: + printSetFlushMode(false); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\tPPC_STORE_U64({}, {}.u64);", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_STFDX: printSetFlushMode(false); print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64("); @@ -1451,6 +1541,14 @@ bool Recompiler::Recompile( println("{}, {}.u32);", int32_t(insn.operands[1]), temp()); break; + case PPC_INST_STFSU: + printSetFlushMode(false); + println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\tPPC_STORE_U32({}, {}.u32);", ea(), temp()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_STFSX: printSetFlushMode(false); println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); @@ -1460,6 +1558,14 @@ bool Recompiler::Recompile( println("{}.u32, {}.u32);", r(insn.operands[2]), temp()); break; + case PPC_INST_STFSUX: + printSetFlushMode(false); + println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\tPPC_STORE_U32({}, {}.u32);", ea(), temp()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STH: print("{}", mmioStore() ? "\tPPC_MM_STORE_U16(" : "\tPPC_STORE_U16("); if (insn.operands[2] != 0) @@ -1467,6 +1573,18 @@ bool Recompiler::Recompile( println("{}, {}.u16);", int32_t(insn.operands[1]), r(insn.operands[0])); break; + case PPC_INST_STHU: + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\tPPC_STORE_U16({}, {}.u16);", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + + case PPC_INST_STHUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\tPPC_STORE_U16({}, {}.u16);", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STHBRX: print("{}", mmioStore() ? "\tPPC_MM_STORE_U16(" : "\tPPC_STORE_U16("); if (insn.operands[1] != 0) @@ -1635,10 +1753,18 @@ bool Recompiler::Recompile( println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VADDSBS: + println("\t_mm_store_si128((__m128i*){}.s8, _mm_adds_epi8(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VADDSHS: println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VADDSWS: + println("\t_mm_store_si128((__m128i*){}.s32, _mm_adds_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VADDUBM: println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; @@ -1680,6 +1806,10 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VAVGUH: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VCTSXS: case PPC_INST_VCFPSXWS128: printSetFlushMode(true); @@ -1743,6 +1873,12 @@ bool Recompiler::Recompile( println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; + case PPC_INST_VCMPEQUH: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); + break; + case PPC_INST_VCMPEQUW: case PPC_INST_VCMPEQUW128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); @@ -1768,10 +1904,26 @@ bool Recompiler::Recompile( case PPC_INST_VCMPGTUB: println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTUH: println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); + break; + + case PPC_INST_VCMPGTSH: + println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s16), 0xFFFF);", cr(6), v(insn.operands[0])); + break; + + case PPC_INST_VCMPGTSW: + println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s32), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VEXPTEFP: @@ -1803,10 +1955,18 @@ bool Recompiler::Recompile( println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VMAXSH: + println("\t_mm_store_si128((__m128i*){}.u16, _mm_max_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VMAXSW: println("\t_mm_store_si128((__m128i*){}.u32, _mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VMINSH: + println("\t_mm_store_si128((__m128i*){}.u16, _mm_max_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VMINFP: case PPC_INST_VMINFP128: printSetFlushMode(true); @@ -1915,11 +2075,26 @@ bool Recompiler::Recompile( } break; + case PPC_INST_VPKSHSS: + case PPC_INST_VPKSHSS128: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + + case PPC_INST_VPKSWSS: + case PPC_INST_VPKSWSS128: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + case PPC_INST_VPKSHUS: case PPC_INST_VPKSHUS128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; + case PPC_INST_VPKSWUS: + case PPC_INST_VPKSWUS128: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + case PPC_INST_VREFP: case PPC_INST_VREFP128: // TODO: see if we can use rcp safely @@ -1961,6 +2136,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VSEL: + case PPC_INST_VSEL128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2])); break; @@ -1970,6 +2146,12 @@ bool Recompiler::Recompile( println("\t{}.u8[{}] = {}.u8[{}] << ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i); break; + case PPC_INST_VSLH: + // TODO: vectorize + for (size_t i = 0; i < 8; i++) + println("\t{}.u16[{}] = {}.u16[{}] << ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i); + break; + case PPC_INST_VSLDOI: case PPC_INST_VSLDOI128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]); @@ -2003,6 +2185,10 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]); break; + case PPC_INST_VSPLTISH: + println("\t_mm_store_si128((__m128i*){}.u16, _mm_set1_epi16(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); + break; + case PPC_INST_VSPLTISW: case PPC_INST_VSPLTISW128: println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); @@ -2022,6 +2208,18 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsr(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VSRAB: + // TODO: vectorize, ensure endianness is correct + for (size_t i = 0; i < 16; i++) + println("\t{}.s8[{}] = {}.s8[{}] >> ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); + break; + + case PPC_INST_VSRAH: + // TODO: vectorize, ensure endianness is correct + for (size_t i = 0; i < 8; i++) + println("\t{}.s16[{}] = {}.s16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); + break; + case PPC_INST_VSRAW: case PPC_INST_VSRAW128: // TODO: vectorize, ensure endianness is correct @@ -2029,6 +2227,12 @@ bool Recompiler::Recompile( println("\t{}.s32[{}] = {}.s32[{}] >> ({}.u8[{}] & 0x1F);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); break; + case PPC_INST_VSRH: + // TODO: vectorize, ensure endianness is correct + for (size_t i = 0; i < 8; i++) + println("\t{}.u16[{}] = {}.u16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); + break; + case PPC_INST_VSRW: case PPC_INST_VSRW128: // TODO: vectorize, ensure endianness is correct @@ -2042,6 +2246,15 @@ bool Recompiler::Recompile( println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VSUBSHS: + // TODO: vectorize + for (size_t i = 0; i < 8; i++) + { + println("\t{}.s64 = int64_t({}.s16[{}]) - int64_t({}.s16[{}]);", temp(), v(insn.operands[1]), i, v(insn.operands[2]), i); + println("\t{}.s16[{}] = {}.s64 > SHRT_MAX ? SHRT_MAX : {}.s64 < SHRT_MIN ? SHRT_MIN : {}.s64;", v(insn.operands[0]), i, temp(), temp(), temp()); + } + break; + case PPC_INST_VSUBSWS: // TODO: vectorize for (size_t i = 0; i < 4; i++) @@ -2055,8 +2268,12 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_subs_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VSUBUBM: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VSUBUHM: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VUPKD3D128: From db477c131d78c24391f86b26f72bcf84aaeb9414 Mon Sep 17 00:00:00 2001 From: DeaTh-G <55578911+DeaTh-G@users.noreply.github.com> Date: Sun, 13 Oct 2024 19:29:33 +0200 Subject: [PATCH 02/18] Fix indexing on certain instructions --- PowerRecomp/recompiler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index 94379f2..56ffd6a 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -2211,13 +2211,13 @@ bool Recompiler::Recompile( case PPC_INST_VSRAB: // TODO: vectorize, ensure endianness is correct for (size_t i = 0; i < 16; i++) - println("\t{}.s8[{}] = {}.s8[{}] >> ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); + println("\t{}.s8[{}] = {}.s8[{}] >> ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i); break; case PPC_INST_VSRAH: // TODO: vectorize, ensure endianness is correct for (size_t i = 0; i < 8; i++) - println("\t{}.s16[{}] = {}.s16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); + println("\t{}.s16[{}] = {}.s16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2); break; case PPC_INST_VSRAW: @@ -2230,7 +2230,7 @@ bool Recompiler::Recompile( case PPC_INST_VSRH: // TODO: vectorize, ensure endianness is correct for (size_t i = 0; i < 8; i++) - println("\t{}.u16[{}] = {}.u16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); + println("\t{}.u16[{}] = {}.u16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2); break; case PPC_INST_VSRW: From e33e0af159307165d60a3f54c79aa69fc2437622 Mon Sep 17 00:00:00 2001 From: DeaTh-G <55578911+DeaTh-G@users.noreply.github.com> Date: Sun, 3 Nov 2024 15:42:24 +0100 Subject: [PATCH 03/18] add more basic instructions --- PowerRecomp/recompiler.cpp | 66 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index 56ffd6a..b5568f3 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -426,6 +426,13 @@ bool Recompiler::Recompile( println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; + case PPC_INST_ADDC: + println("\t{}.ca = {}.u32 >= ~{}.u32;", xer(), r(insn.operands[2]), r(insn.operands[1])); + println("\t{}.u64 = {}.u64 + {}.u64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_ADDE: println("\t{}.u8 = ({}.u32 + {}.u32 < {}.u32) | ({}.u32 + {}.u32 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[2]), xer(), xer()); println("\t{}.u64 = {}.u64 + {}.u64 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), xer()); @@ -434,6 +441,14 @@ bool Recompiler::Recompile( println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; + case PPC_INST_ADDME: + println("\t{}.u8 = ({}.u32 - 1 < {}.u32) | ({}.u32 - 1 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[1]), xer(), xer()); + println("\t{}.u64 = {}.u64 - 1 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), xer()); + println("\t{}.ca = {}.u8;", xer(), temp()); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_ADDI: print("\t{}.s64 = ", r(insn.operands[0])); if (insn.operands[1] != 0) @@ -547,6 +562,14 @@ bool Recompiler::Recompile( println("\tif ({}.u32 == 0) goto loc_{:X};", ctr(), insn.operands[0]); break; + case PPC_INST_BDZF: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t--{}.u64;", ctr()); + println("\tif ({}.u32 == 0 && !{}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]); + break; + } + case PPC_INST_BDZLR: println("\t--{}.u64;", ctr()); println("\tif ({}.u32 == 0) return;", ctr(), insn.operands[0]); @@ -558,10 +581,20 @@ bool Recompiler::Recompile( break; case PPC_INST_BDNZF: - // NOTE: assuming eq here as a shortcut because all the instructions in the game do that + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; println("\t--{}.u64;", ctr()); - println("\tif ({}.u32 != 0 && !{}.eq) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), insn.operands[1]); + println("\tif ({}.u32 != 0 && !{}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]); break; + } + + case PPC_INST_BDNZT: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t--{}.u64;", ctr()); + println("\tif ({}.u32 != 0 && {}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]); + break; + } case PPC_INST_BEQ: printConditionalBranch(false, "eq"); @@ -691,6 +724,20 @@ bool Recompiler::Recompile( println("\t{}.u64 = __lzcnt({}.u32);", r(insn.operands[0]), r(insn.operands[1])); break; + case PPC_INST_CROR: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t{}.{} = {}.{} | {}.{};", cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], cr(insn.operands[1] / 4), fields[insn.operands[1] % 4], cr(insn.operands[2] / 4), fields[insn.operands[2] % 4]); + break; + } + + case PPC_INST_CRORC: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t{}.{} = {}.{} | (~{}.{} & 1);", cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], cr(insn.operands[1] / 4), fields[insn.operands[1] % 4], cr(insn.operands[2] / 4), fields[insn.operands[2] % 4]); + break; + } + case PPC_INST_DB16CYC: // no op break; @@ -751,6 +798,13 @@ bool Recompiler::Recompile( // no op break; + case PPC_INST_EQV: + println("\t{}.u64 = ~({}.u32 ^ {}.u32);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + + break; + case PPC_INST_EXTSB: println("\t{}.s64 = {}.s8;", r(insn.operands[0]), r(insn.operands[1])); if (strchr(insn.opcode->name, '.')) @@ -1718,6 +1772,14 @@ bool Recompiler::Recompile( println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; + case PPC_INST_SUBFZE: + println("\t{}.u8 = (~{}.u32 < ~{}.u32) | (~{}.u32 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[1]), xer(), xer()); + println("\t{}.u64 = ~{}.u64 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), xer()); + println("\t{}.ca = {}.u8;", xer(), temp()); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_SUBFIC: println("\t{}.ca = {}.u32 <= {};", xer(), r(insn.operands[1]), insn.operands[2]); println("\t{}.s64 = {} - {}.s64;", r(insn.operands[0]), int32_t(insn.operands[2]), r(insn.operands[1])); From c8ecd79c991445f05b4310e78a42ecc691b309a9 Mon Sep 17 00:00:00 2001 From: DeaTh-G <55578911+DeaTh-G@users.noreply.github.com> Date: Sun, 3 Nov 2024 19:16:26 +0100 Subject: [PATCH 04/18] Fix instruction implementations based on unit tests --- PowerRecomp/recompiler.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index b5568f3..65b3e12 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -427,7 +427,7 @@ bool Recompiler::Recompile( break; case PPC_INST_ADDC: - println("\t{}.ca = {}.u32 >= ~{}.u32;", xer(), r(insn.operands[2]), r(insn.operands[1])); + println("\t{}.ca = {}.u32 > ~{}.u32;", xer(), r(insn.operands[2]), r(insn.operands[1])); println("\t{}.u64 = {}.u64 + {}.u64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); if (strchr(insn.opcode->name, '.')) println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); @@ -799,10 +799,9 @@ bool Recompiler::Recompile( break; case PPC_INST_EQV: - println("\t{}.u64 = ~({}.u32 ^ {}.u32);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = ~({}.u64 ^ {}.u64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); if (strchr(insn.opcode->name, '.')) println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); - break; case PPC_INST_EXTSB: @@ -2026,7 +2025,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VMINSH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_max_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\t_mm_store_si128((__m128i*){}.u16, _mm_min_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMINFP: @@ -2211,7 +2210,7 @@ bool Recompiler::Recompile( case PPC_INST_VSLH: // TODO: vectorize for (size_t i = 0; i < 8; i++) - println("\t{}.u16[{}] = {}.u16[{}] << ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i); + println("\t{}.u16[{}] = {}.u16[{}] << ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2); break; case PPC_INST_VSLDOI: @@ -2743,7 +2742,7 @@ void Recompiler::SaveCurrentOutData(const std::string_view& name) bool shouldWrite = true; // Check if an identical file already exists first to not trigger recompilation - std::string filePath = std::format("{}/{}/{}", config.directoryPath, config.outDirectoryPath, name.empty() ? cppName : name); + std::string filePath = std::format("{}{}/{}", config.directoryPath, config.outDirectoryPath, name.empty() ? cppName : name); FILE* f = fopen(filePath.c_str(), "rb"); if (f) { From a382cd56532cb2b7311e39840e15dc54e64f81ca Mon Sep 17 00:00:00 2001 From: DeaTh-G <55578911+DeaTh-G@users.noreply.github.com> Date: Sun, 3 Nov 2024 20:27:53 +0100 Subject: [PATCH 05/18] add vpkuhus implementation --- PowerRecomp/recompiler.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index 65b3e12..8593e30 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -2156,6 +2156,16 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; + case PPC_INST_VPKUHUS: + case PPC_INST_VPKUHUS128: + for (size_t i = 0; i < 8; i++) + { + println("\t{0}.u8[{1}] = {2}.u16[{1}] > UCHAR_MAX ? UCHAR_MAX : {2}.u16[{1}];", vTemp(), i, v(insn.operands[2])); + println("\t{0}.u8[{1}] = {2}.u16[{3}] > UCHAR_MAX ? UCHAR_MAX : {2}.u16[{3}];", vTemp(), i + 8, v(insn.operands[1]), i); + } + println("{} = {};", v(insn.operands[0]), vTemp()); + break; + case PPC_INST_VREFP: case PPC_INST_VREFP128: // TODO: see if we can use rcp safely From 7a3db6837f959a012bf948365ca4b1326a591184 Mon Sep 17 00:00:00 2001 From: DeaTh-G <55578911+DeaTh-G@users.noreply.github.com> Date: Sun, 3 Nov 2024 21:27:33 +0100 Subject: [PATCH 06/18] add remaining altivec instructions --- PowerRecomp/recompiler.cpp | 20 +++++++++++++++++++- PowerUtils/ppc_context.h | 13 +++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index 8593e30..b1dbd8b 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -1881,6 +1881,16 @@ bool Recompiler::Recompile( println("_mm_load_ps({}.f32)));", v(insn.operands[1])); break; + case PPC_INST_VCTUXS: + case PPC_INST_VCFPUXWS128: + printSetFlushMode(true); + print("\t_mm_store_si128((__m128i*){}.u32, _mm_vctuxs(", v(insn.operands[0])); + if (insn.operands[2] != 0) + println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); + else + println("_mm_load_ps({}.f32)));", v(insn.operands[1])); + break; + case PPC_INST_VCFSX: case PPC_INST_VCSXWFP128: { @@ -2198,6 +2208,14 @@ bool Recompiler::Recompile( break; } + case PPC_INST_VRLH: + for (size_t i = 0; i < 8; i++) + { + println("\t{0}.u16[{1}] = ({2}.u16[{1}] << ({3}.u16[{1}] & 0xF)) | ({2}.u16[{1}] >> (16 - ({3}.u16[{1}] & 0xF)));", vTemp(), i, v(insn.operands[1]), v(insn.operands[2])); + } + println("{} = {};", v(insn.operands[0]), vTemp()); + break; + case PPC_INST_VRSQRTEFP: case PPC_INST_VRSQRTEFP128: // TODO: see if we can use rsqrt safely @@ -2752,7 +2770,7 @@ void Recompiler::SaveCurrentOutData(const std::string_view& name) bool shouldWrite = true; // Check if an identical file already exists first to not trigger recompilation - std::string filePath = std::format("{}{}/{}", config.directoryPath, config.outDirectoryPath, name.empty() ? cppName : name); + std::string filePath = std::format("{}/{}/{}", config.directoryPath, config.outDirectoryPath, name.empty() ? cppName : name); FILE* f = fopen(filePath.c_str(), "rb"); if (f) { diff --git a/PowerUtils/ppc_context.h b/PowerUtils/ppc_context.h index 64c257d..727492b 100644 --- a/PowerUtils/ppc_context.h +++ b/PowerUtils/ppc_context.h @@ -644,6 +644,19 @@ inline __m128i _mm_vctsxs(__m128 src1) return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest)); } +inline __m128i _mm_vctuxs(__m128 src1) +{ + __m128 xmm0 = _mm_max_ps(src1, _mm_set1_epi32(0)); + __m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000)); + __m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000)); + xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1); + __m128i dest = _mm_cvttps_epi32(xmm0); + xmm0 = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN)); + xmm1 = _mm_and_si128(xmm1, _mm_set1_epi32(INT_MIN)); + dest = _mm_add_epi32(dest, xmm1); + return _mm_or_si128(dest, xmm0); +} + inline __m128i _mm_vsr(__m128i a, __m128i b) { b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61); From 3dcf708c4db74993fbacd6cb2b7eeb0f762c8217 Mon Sep 17 00:00:00 2001 From: DeaTh-G Date: Thu, 14 Nov 2024 17:34:45 +0100 Subject: [PATCH 07/18] fix vaddsws implementation --- PowerRecomp/recompiler.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index b1dbd8b..11649c6 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -1823,7 +1823,12 @@ bool Recompiler::Recompile( break; case PPC_INST_VADDSWS: - println("\t_mm_store_si128((__m128i*){}.s32, _mm_adds_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + // TODO: vectorize + for (size_t i = 0; i < 4; i++) + { + println("\t{}.s64 = int64_t({}.s32[{}]) + int64_t({}.s32[{}]);", temp(), v(insn.operands[1]), i, v(insn.operands[2]), i); + println("\t{}.s32[{}] = {}.s64 > INT_MAX ? INT_MAX : {}.s64 < INT_MIN ? INT_MIN : {}.s64;", v(insn.operands[0]), i, temp(), temp(), temp()); + } break; case PPC_INST_VADDUBM: From 943fcafd37187fcae5fab31294af21b4ab4994b5 Mon Sep 17 00:00:00 2001 From: DeaTh-G Date: Sun, 24 Nov 2024 10:50:58 +0100 Subject: [PATCH 08/18] make store instructions check for mmio --- PowerRecomp/recompiler.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/PowerRecomp/recompiler.cpp b/PowerRecomp/recompiler.cpp index 11649c6..0821717 100644 --- a/PowerRecomp/recompiler.cpp +++ b/PowerRecomp/recompiler.cpp @@ -1501,7 +1501,7 @@ bool Recompiler::Recompile( case PPC_INST_STBU: println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U8({}, {}.u8);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u8);", mmioStore() ? "PPC_MM_STORE_U8(" : "PPC_STORE_U8(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; @@ -1514,7 +1514,7 @@ bool Recompiler::Recompile( case PPC_INST_STBUX: println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U8({}, {}.u8);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u8);", mmioStore() ? "PPC_MM_STORE_U8(" : "PPC_STORE_U8(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[1]), ea()); break; @@ -1537,7 +1537,7 @@ bool Recompiler::Recompile( case PPC_INST_STDU: println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U64({}, {}.u64);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; @@ -1550,7 +1550,7 @@ bool Recompiler::Recompile( case PPC_INST_STDUX: println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U64({}, {}.u64);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[1]), ea()); break; @@ -1565,7 +1565,7 @@ bool Recompiler::Recompile( case PPC_INST_STFDU: printSetFlushMode(false); println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U64({}, {}.u64);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; @@ -1598,7 +1598,7 @@ bool Recompiler::Recompile( printSetFlushMode(false); println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U32({}, {}.u32);", ea(), temp()); + println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), temp()); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; @@ -1615,7 +1615,7 @@ bool Recompiler::Recompile( printSetFlushMode(false); println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U32({}, {}.u32);", ea(), temp()); + println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), temp()); println("\t{}.u32 = {};", r(insn.operands[1]), ea()); break; @@ -1628,13 +1628,13 @@ bool Recompiler::Recompile( case PPC_INST_STHU: println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U16({}, {}.u16);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u16);", mmioStore() ? "PPC_MM_STORE_U16(" : "PPC_STORE_U16(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; case PPC_INST_STHUX: println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U16({}, {}.u16);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u16);", mmioStore() ? "PPC_MM_STORE_U16(" : "PPC_STORE_U16(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[1]), ea()); break; @@ -1733,13 +1733,13 @@ bool Recompiler::Recompile( case PPC_INST_STWU: println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U32({}, {}.u32);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; case PPC_INST_STWUX: println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U32({}, {}.u32);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[1]), ea()); break; From 8de0e8ca3088df4c84b68ee484ffa89dc10a8667 Mon Sep 17 00:00:00 2001 From: Isaac Marovitz Date: Mon, 3 Mar 2025 09:12:21 +0000 Subject: [PATCH 09/18] Inital add SIMD-E Signed-off-by: Isaac Marovitz --- .gitmodules | 3 +++ XenonRecomp/pch.h | 5 ++++- XenonUtils/CMakeLists.txt | 3 ++- thirdparty/simde | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) create mode 160000 thirdparty/simde diff --git a/.gitmodules b/.gitmodules index d12d6a2..271d969 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "thirdparty/tiny-AES-c"] path = thirdparty/tiny-AES-c url = https://github.com/kokke/tiny-AES-c.git +[submodule "thirdparty/simde"] + path = thirdparty/simde + url = https://github.com/simd-everywhere/simde-no-tests.git diff --git a/XenonRecomp/pch.h b/XenonRecomp/pch.h index 02e53d9..aaf2e47 100644 --- a/XenonRecomp/pch.h +++ b/XenonRecomp/pch.h @@ -16,4 +16,7 @@ #include #include #include -#include + +# define SIMDE_ENABLE_NATIVE_ALIASES + +#include diff --git a/XenonUtils/CMakeLists.txt b/XenonUtils/CMakeLists.txt index b8ca66f..1807323 100644 --- a/XenonUtils/CMakeLists.txt +++ b/XenonUtils/CMakeLists.txt @@ -17,8 +17,9 @@ target_compile_definitions(XenonUtils ) target_include_directories(XenonUtils - PUBLIC + PUBLIC . + "${THIRDPARTY_ROOT}/simde" PRIVATE "${THIRDPARTY_ROOT}/libmspack/libmspack/mspack" "${THIRDPARTY_ROOT}/tiny-AES-c" diff --git a/thirdparty/simde b/thirdparty/simde new file mode 160000 index 0000000..a532a12 --- /dev/null +++ b/thirdparty/simde @@ -0,0 +1 @@ +Subproject commit a532a12ca9bbdc5e6547eb602e6256b71a5377d4 From 5d500f26e7688eda05ec31b85250d70a3903b4e7 Mon Sep 17 00:00:00 2001 From: Isaac Marovitz Date: Mon, 3 Mar 2025 09:41:41 +0000 Subject: [PATCH 10/18] Include simde in ppc_context.h Signed-off-by: Isaac Marovitz --- XenonRecomp/pch.h | 2 +- XenonUtils/ppc_context.h | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/XenonRecomp/pch.h b/XenonRecomp/pch.h index aaf2e47..0dc455c 100644 --- a/XenonRecomp/pch.h +++ b/XenonRecomp/pch.h @@ -17,6 +17,6 @@ #include #include -# define SIMDE_ENABLE_NATIVE_ALIASES +#define SIMDE_ENABLE_NATIVE_ALIASES #include diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h index bc427c9..17b89fb 100644 --- a/XenonUtils/ppc_context.h +++ b/XenonUtils/ppc_context.h @@ -12,14 +12,9 @@ #include #include -#include - -#ifdef _WIN32 -#include -#else -#include -#include -#endif +#define SIMDE_ENABLE_NATIVE_ALIASES +#include +#include #define PPC_JOIN(x, y) x##y #define PPC_XSTRINGIFY(x) #x From d1d8b9e5975f4418f10e5fab9bd538af009a792e Mon Sep 17 00:00:00 2001 From: Isaac Marovitz Date: Mon, 3 Mar 2025 22:17:31 +0000 Subject: [PATCH 11/18] =?UTF-8?q?Fix=20missing=20defines,=20don=E2=80=99t?= =?UTF-8?q?=20use=20native=20aliases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Isaac Marovitz --- XenonUtils/ppc_context.h | 109 ++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h index 17b89fb..41a06f5 100644 --- a/XenonUtils/ppc_context.h +++ b/XenonUtils/ppc_context.h @@ -12,10 +12,15 @@ #include #include -#define SIMDE_ENABLE_NATIVE_ALIASES +#include #include #include +// SSE3 constants are missing from simde +#ifndef _MM_DENORMALS_ZERO_MASK +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#endif + #define PPC_JOIN(x, y) x##y #define PPC_XSTRINGIFY(x) #x #define PPC_STRINGIFY(x) PPC_XSTRINGIFY(x) @@ -170,18 +175,18 @@ struct PPCCRRegister eq = !un && (left == right); } - inline void setFromMask(__m128 mask, int imm) noexcept + inline void setFromMask(simde__m128 mask, int imm) noexcept { - int m = _mm_movemask_ps(mask); + int m = simde_mm_movemask_ps(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal so = 0; } - inline void setFromMask(__m128i mask, int imm) noexcept + inline void setFromMask(simde__m128i mask, int imm) noexcept { - int m = _mm_movemask_epi8(mask); + int m = simde_mm_movemask_epi8(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal @@ -216,34 +221,34 @@ struct PPCFPSCRRegister { uint32_t csr; - static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN }; + static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN }; static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO }; inline uint32_t loadFromHost() noexcept { - csr = _mm_getcsr(); - return HostToGuest[(csr & _MM_ROUND_MASK) >> 13]; + csr = simde_mm_getcsr(); + return HostToGuest[(csr & SIMDE_MM_ROUND_MASK) >> 13]; } inline void storeFromGuest(uint32_t value) noexcept { - csr &= ~_MM_ROUND_MASK; + csr &= ~SIMDE_MM_ROUND_MASK; csr |= GuestToHost[value & PPC_ROUND_MASK]; - _mm_setcsr(csr); + simde_mm_setcsr(csr); } - static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; + static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; inline void enableFlushModeUnconditional() noexcept { csr |= FlushMask; - _mm_setcsr(csr); + simde_mm_setcsr(csr); } inline void disableFlushModeUnconditional() noexcept { csr &= ~FlushMask; - _mm_setcsr(csr); + simde_mm_setcsr(csr); } inline void enableFlushMode() noexcept @@ -251,7 +256,7 @@ struct PPCFPSCRRegister if ((csr & FlushMask) != FlushMask) [[unlikely]] { csr |= FlushMask; - _mm_setcsr(csr); + simde_mm_setcsr(csr); } } @@ -260,7 +265,7 @@ struct PPCFPSCRRegister if ((csr & FlushMask) != 0) [[unlikely]] { csr &= ~FlushMask; - _mm_setcsr(csr); + simde_mm_setcsr(csr); } } }; @@ -588,68 +593,68 @@ inline uint8_t VectorShiftTableR[] = 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, }; -inline __m128i _mm_adds_epu32(__m128i a, __m128i b) +inline simde__m128i _mm_adds_epu32(simde__m128i a, simde__m128i b) { - return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b)); + return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b)); } -inline __m128i _mm_avg_epi8(__m128i a, __m128i b) +inline simde__m128i _mm_avg_epi8(simde__m128i a, simde__m128i b) { - __m128i c = _mm_set1_epi8(char(128)); - return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); + simde__m128i c = simde_mm_set1_epi8(char(128)); + return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b))); } -inline __m128i _mm_avg_epi16(__m128i a, __m128i b) +inline simde__m128i _mm_avg_epi16(simde__m128i a, simde__m128i b) { - __m128i c = _mm_set1_epi16(short(32768)); - return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); + simde__m128i c = simde_mm_set1_epi16(short(32768)); + return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b))); } -inline __m128 _mm_cvtepu32_ps_(__m128i src1) +inline simde__m128 _mm_cvtepu32_ps_(simde__m128i src1) { - __m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127)); - __m128i xmm0 = _mm_slli_epi32(src1, 31 - 8); - xmm0 = _mm_srli_epi32(xmm0, 31); - xmm0 = _mm_add_epi32(xmm0, xmm1); - xmm0 = _mm_srai_epi32(xmm0, 8); - xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000)); - __m128 xmm2 = _mm_cvtepi32_ps(src1); - return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1)); + simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127)); + simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8); + xmm0 = simde_mm_srli_epi32(xmm0, 31); + xmm0 = simde_mm_add_epi32(xmm0, xmm1); + xmm0 = simde_mm_srai_epi32(xmm0, 8); + xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000)); + simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1); + return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1)); } -inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c) +inline simde__m128i _mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c) { - __m128i d = _mm_set1_epi8(0xF); - __m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d)); - return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3)); + simde__m128i d = simde_mm_set1_epi8(0xF); + simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d)); + return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3)); } -inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b) +inline simde__m128i _mm_cmpgt_epu8(simde__m128i a, simde__m128i b) { - __m128i c = _mm_set1_epi8(char(128)); - return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); + simde__m128i c = simde_mm_set1_epi8(char(128)); + return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c)); } -inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b) +inline simde__m128i _mm_cmpgt_epu16(simde__m128i a, simde__m128i b) { - __m128i c = _mm_set1_epi16(short(32768)); - return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); + simde__m128i c = simde_mm_set1_epi16(short(32768)); + return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c)); } -inline __m128i _mm_vctsxs(__m128 src1) +inline simde__m128i _mm_vctsxs(simde__m128 src1) { - __m128 xmm2 = _mm_cmpunord_ps(src1, src1); - __m128i xmm0 = _mm_cvttps_epi32(src1); - __m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN)); - xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1); - __m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1)); - return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest)); + simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1); + simde__m128i xmm0 = simde_mm_cvttps_epi32(src1); + simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN)); + xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1); + simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1)); + return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest)); } -inline __m128i _mm_vsr(__m128i a, __m128i b) +inline simde__m128i _mm_vsr(simde__m128i a, simde__m128i b) { - b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61); - return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10)); + b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61); + return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10)); } #endif From aad99be74d002b13e6cf06674baa872f851c2533 Mon Sep 17 00:00:00 2001 From: Isaac Marovitz Date: Mon, 3 Mar 2025 22:20:27 +0000 Subject: [PATCH 12/18] Update recompiler Signed-off-by: Isaac Marovitz --- XenonRecomp/pch.h | 3 - XenonRecomp/recompiler.cpp | 178 ++++++++++++++++++------------------- XenonUtils/ppc_context.h | 18 ++-- 3 files changed, 98 insertions(+), 101 deletions(-) diff --git a/XenonRecomp/pch.h b/XenonRecomp/pch.h index 0dc455c..c9f9a85 100644 --- a/XenonRecomp/pch.h +++ b/XenonRecomp/pch.h @@ -16,7 +16,4 @@ #include #include #include - -#define SIMDE_ENABLE_NATIVE_ALIASES - #include diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp index 5374797..21ac2e3 100644 --- a/XenonRecomp/recompiler.cpp +++ b/XenonRecomp/recompiler.cpp @@ -896,17 +896,17 @@ bool Recompiler::Recompile( case PPC_INST_FCTID: printSetFlushMode(false); - println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvtsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); + println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : simde_mm_cvtsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); break; case PPC_INST_FCTIDZ: printSetFlushMode(false); - println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvttsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); + println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : simde_mm_cvttsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); break; case PPC_INST_FCTIWZ: printSetFlushMode(false); - println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : _mm_cvttsd_si32(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); + println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : simde_mm_cvttsd_si32(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); break; case PPC_INST_FDIV: @@ -1138,10 +1138,10 @@ bool Recompiler::Recompile( case PPC_INST_LVX128: // NOTE: for endian swapping, we reverse the whole vector instead of individual elements. // this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz) - print("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ((", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ((", v(insn.operands[0])); if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); - println("{}.u32) & ~0xF))), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2])); + println("{}.u32) & ~0xF))), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2])); break; case PPC_INST_LVLX: @@ -1150,7 +1150,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp()); break; case PPC_INST_LVRX: @@ -1159,7 +1159,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, {}.u32 & 0xF ? _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : _mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, {}.u32 & 0xF ? simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : simde_mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp()); break; case PPC_INST_LVSL: @@ -1167,7 +1167,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); break; case PPC_INST_LVSR: @@ -1175,7 +1175,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); break; case PPC_INST_LWA: @@ -1634,10 +1634,10 @@ bool Recompiler::Recompile( case PPC_INST_STVX: case PPC_INST_STVX128: - print("\t_mm_store_si128((__m128i*)(base + (("); + print("\tsimde_mm_store_si128((simde__m128i*)(base + (("); if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); - println("{}.u32) & ~0xF)), _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0])); + println("{}.u32) & ~0xF)), simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0])); break; case PPC_INST_STW: @@ -1736,77 +1736,77 @@ bool Recompiler::Recompile( case PPC_INST_VADDFP: case PPC_INST_VADDFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDSHS: - println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_adds_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUBM: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_add_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUBS: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_adds_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_adds_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUHM: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_add_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_add_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUWM: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_add_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_add_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUWS: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_adds_epu32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_adds_epu32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VAND: case PPC_INST_VAND128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VANDC128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VAVGSB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VAVGSH: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi16(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VAVGUB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VCTSXS: case PPC_INST_VCFPSXWS128: printSetFlushMode(true); - print("\t_mm_store_si128((__m128i*){}.s32, _mm_vctsxs(", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_vctsxs(", v(insn.operands[0])); if (insn.operands[2] != 0) - println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); + println("simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); else - println("_mm_load_ps({}.f32)));", v(insn.operands[1])); + println("simde_mm_load_ps({}.f32)));", v(insn.operands[1])); break; case PPC_INST_VCFSX: case PPC_INST_VCSXWFP128: { printSetFlushMode(true); - print("\t_mm_store_ps({}.f32, ", v(insn.operands[0])); + print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0])); if (insn.operands[2] != 0) { const float value = ldexp(1.0f, -int32_t(insn.operands[2])); - println("_mm_mul_ps(_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); + println("simde_mm_mul_ps(simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); } else { - println("_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1])); + println("simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1])); } break; } @@ -1815,15 +1815,15 @@ bool Recompiler::Recompile( case PPC_INST_VCUXWFP128: { printSetFlushMode(true); - print("\t_mm_store_ps({}.f32, ", v(insn.operands[0])); + print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0])); if (insn.operands[2] != 0) { const float value = ldexp(1.0f, -int32_t(insn.operands[2])); - println("_mm_mul_ps(_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); + println("simde_mm_mul_ps(simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); } else { - println("_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1])); + println("simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1])); } break; } @@ -1836,46 +1836,46 @@ bool Recompiler::Recompile( case PPC_INST_VCMPEQFP: case PPC_INST_VCMPEQFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_cmpeq_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpeq_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPEQUB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPEQUW: case PPC_INST_VCMPEQUW128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGEFP: case PPC_INST_VCMPGEFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_cmpge_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpge_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTFP: case PPC_INST_VCMPGTFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_cmpgt_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpgt_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTUB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VCMPGTUH: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VEXPTEFP: @@ -1898,87 +1898,87 @@ bool Recompiler::Recompile( case PPC_INST_VMADDFP: case PPC_INST_VMADDFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); break; case PPC_INST_VMAXFP: case PPC_INST_VMAXFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_max_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMAXSW: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_max_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMINFP: case PPC_INST_VMINFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_min_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_min_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMRGHB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpackhi_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpackhi_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGHH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpackhi_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpackhi_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGHW: case PPC_INST_VMRGHW128: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpackhi_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpackhi_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGLB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpacklo_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpacklo_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGLH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpacklo_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpacklo_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGLW: case PPC_INST_VMRGLW128: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpacklo_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpacklo_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMSUM3FP128: // NOTE: accounting for full vector reversal here. should dot product yzw instead of xyz printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMSUM4FP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMULFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VNMSUBFP: case PPC_INST_VNMSUBFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_xor_ps(simde_mm_sub_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); break; case PPC_INST_VOR: case PPC_INST_VOR128: - print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0])); if (insn.operands[1] != insn.operands[2]) - println("_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); + println("simde_mm_or_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); else - println("_mm_load_si128((__m128i*){}.u8));", v(insn.operands[1])); + println("simde_mm_load_si128((simde__m128i*){}.u8));", v(insn.operands[1])); break; case PPC_INST_VPERM: case PPC_INST_VPERM128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_perm_epi8_(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_perm_epi8_(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); break; case PPC_INST_VPERMWI128: @@ -1989,7 +1989,7 @@ bool Recompiler::Recompile( uint32_t z = 3 - ((insn.operands[2] >> 4) & 0x3); uint32_t w = 3 - ((insn.operands[2] >> 6) & 0x3); uint32_t perm = x | (y << 2) | (z << 4) | (w << 6); - println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } @@ -2021,38 +2021,38 @@ bool Recompiler::Recompile( case PPC_INST_VPKSHUS: case PPC_INST_VPKSHUS128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_packus_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VREFP: case PPC_INST_VREFP128: // TODO: see if we can use rcp safely printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIM: case PPC_INST_VRFIM128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIN: case PPC_INST_VRFIN128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIZ: case PPC_INST_VRFIZ128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRLIMI128: { - constexpr size_t shuffles[] = { _MM_SHUFFLE(3, 2, 1, 0), _MM_SHUFFLE(2, 1, 0, 3), _MM_SHUFFLE(1, 0, 3, 2), _MM_SHUFFLE(0, 3, 2, 1) }; - println("\t_mm_store_ps({}.f32, _mm_blend_ps(_mm_load_ps({}.f32), _mm_permute_ps(_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]); + constexpr size_t shuffles[] = { SIMDE_MM_SHUFFLE(3, 2, 1, 0), SIMDE_MM_SHUFFLE(2, 1, 0, 3), SIMDE_MM_SHUFFLE(1, 0, 3, 2), SIMDE_MM_SHUFFLE(0, 3, 2, 1) }; + println("\tsimde_mm_store_ps({}.f32, simde_mm_blend_ps(simde_mm_load_ps({}.f32), simde_mm_permute_ps(simde_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]); break; } @@ -2061,11 +2061,11 @@ bool Recompiler::Recompile( // TODO: see if we can use rsqrt safely // TODO: we can detect if the input is from a dot product and apply logic only on one value printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_sqrt_ps(_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_sqrt_ps(simde_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VSEL: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_or_si128(simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)), simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2])); break; case PPC_INST_VSLB: @@ -2076,7 +2076,7 @@ bool Recompiler::Recompile( case PPC_INST_VSLDOI: case PPC_INST_VSLDOI128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_alignr_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]); break; case PPC_INST_VSLW: @@ -2090,7 +2090,7 @@ bool Recompiler::Recompile( { // NOTE: accounting for full vector reversal here uint32_t perm = 15 - insn.operands[2]; - println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } @@ -2099,17 +2099,17 @@ bool Recompiler::Recompile( // NOTE: accounting for full vector reversal here uint32_t perm = 7 - insn.operands[2]; perm = (perm * 2) | ((perm * 2 + 1) << 8); - println("\t_mm_store_si128((__m128i*){}.u16, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } case PPC_INST_VSPLTISB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]); break; case PPC_INST_VSPLTISW: case PPC_INST_VSPLTISW128: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); break; case PPC_INST_VSPLTW: @@ -2118,12 +2118,12 @@ bool Recompiler::Recompile( // NOTE: accounting for full vector reversal here uint32_t perm = 3 - insn.operands[2]; perm |= (perm << 2) | (perm << 4) | (perm << 6); - println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } case PPC_INST_VSR: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsr(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_vsr(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VSRAW: @@ -2143,7 +2143,7 @@ bool Recompiler::Recompile( case PPC_INST_VSUBFP: case PPC_INST_VSUBFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_sub_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VSUBSWS: @@ -2156,11 +2156,11 @@ bool Recompiler::Recompile( break; case PPC_INST_VSUBUBS: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_subs_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_subs_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VSUBUHM: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_sub_epi16(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VUPKD3D128: @@ -2197,32 +2197,32 @@ bool Recompiler::Recompile( case PPC_INST_VUPKHSB: case PPC_INST_VUPKHSB128: - println("\t_mm_store_si128((__m128i*){}.s16, _mm_cvtepi8_epi16(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_cvtepi8_epi16(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s8), simde_mm_load_si128((simde__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); break; case PPC_INST_VUPKHSH: case PPC_INST_VUPKHSH128: - println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); break; case PPC_INST_VUPKLSB: case PPC_INST_VUPKLSB128: - println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi8_epi16(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi8_epi16(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VUPKLSH: case PPC_INST_VUPKLSH128: - println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VXOR: case PPC_INST_VXOR128: - print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0])); if (insn.operands[1] != insn.operands[2]) - println("_mm_xor_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); + println("simde_mm_xor_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); else - println("_mm_setzero_si128());"); + println("simde_mm_setzero_si128());"); break; diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h index 41a06f5..6df9fcc 100644 --- a/XenonUtils/ppc_context.h +++ b/XenonUtils/ppc_context.h @@ -593,24 +593,24 @@ inline uint8_t VectorShiftTableR[] = 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, }; -inline simde__m128i _mm_adds_epu32(simde__m128i a, simde__m128i b) +inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b) { return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b)); } -inline simde__m128i _mm_avg_epi8(simde__m128i a, simde__m128i b) +inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b) { simde__m128i c = simde_mm_set1_epi8(char(128)); return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b))); } -inline simde__m128i _mm_avg_epi16(simde__m128i a, simde__m128i b) +inline simde__m128i simde_mm_avg_epi16(simde__m128i a, simde__m128i b) { simde__m128i c = simde_mm_set1_epi16(short(32768)); return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b))); } -inline simde__m128 _mm_cvtepu32_ps_(simde__m128i src1) +inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1) { simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127)); simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8); @@ -622,26 +622,26 @@ inline simde__m128 _mm_cvtepu32_ps_(simde__m128i src1) return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1)); } -inline simde__m128i _mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c) +inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c) { simde__m128i d = simde_mm_set1_epi8(0xF); simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d)); return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3)); } -inline simde__m128i _mm_cmpgt_epu8(simde__m128i a, simde__m128i b) +inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b) { simde__m128i c = simde_mm_set1_epi8(char(128)); return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c)); } -inline simde__m128i _mm_cmpgt_epu16(simde__m128i a, simde__m128i b) +inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b) { simde__m128i c = simde_mm_set1_epi16(short(32768)); return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c)); } -inline simde__m128i _mm_vctsxs(simde__m128 src1) +inline simde__m128i simde_mm_vctsxs(simde__m128 src1) { simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1); simde__m128i xmm0 = simde_mm_cvttps_epi32(src1); @@ -651,7 +651,7 @@ inline simde__m128i _mm_vctsxs(simde__m128 src1) return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest)); } -inline simde__m128i _mm_vsr(simde__m128i a, simde__m128i b) +inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b) { b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61); return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10)); From 60e421da462b290f26731b08371ba4307bf3f6c6 Mon Sep 17 00:00:00 2001 From: Isaac Marovitz Date: Mon, 3 Mar 2025 22:21:32 +0000 Subject: [PATCH 13/18] Correct fround constants Signed-off-by: Isaac Marovitz --- XenonRecomp/recompiler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp index 21ac2e3..fcae3c2 100644 --- a/XenonRecomp/recompiler.cpp +++ b/XenonRecomp/recompiler.cpp @@ -2034,19 +2034,19 @@ bool Recompiler::Recompile( case PPC_INST_VRFIM: case PPC_INST_VRFIM128: printSetFlushMode(true); - println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIN: case PPC_INST_VRFIN128: printSetFlushMode(true); - println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIZ: case PPC_INST_VRFIZ128: printSetFlushMode(true); - println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRLIMI128: From 8237abe22e6ed222b0afb984c2e56617d049c54b Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Mon, 3 Mar 2025 17:37:59 -0800 Subject: [PATCH 14/18] Implement __rdtsc and FPCSR for ARM64. --- XenonUtils/ppc_context.h | 71 +++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 11 deletions(-) diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h index 6df9fcc..7b6034a 100644 --- a/XenonUtils/ppc_context.h +++ b/XenonUtils/ppc_context.h @@ -221,34 +221,71 @@ struct PPCFPSCRRegister { uint32_t csr; - static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN }; static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO }; + // simde does not handle denormal flags, so we need to implement per-arch. +#if defined(__x86_64__) || defined(_M_X64) + static constexpr size_t RoundShift = 13; + static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK; + static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; + static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN }; + + inline uint32_t getcsr() noexcept + { + return simde_mm_getcsr(); + } + + inline void setcsr(uint32_t csr) noexcept + { + simde_mm_setcsr(csr); + } +#elif defined(__aarch64__) || defined(_M_ARM64) + // RMode + static constexpr size_t RoundShift = 22; + static constexpr size_t RoundMask = 3 << RoundShift; + // FZ and FZ16 + static constexpr size_t FlushMask = (1 << 19) | (1 << 24); + // Nearest, Zero, -Infinity, -Infinity + static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift }; + + inline uint32_t getcsr() noexcept + { + uint64_t csr; + __asm__ __volatile__("mrs %0, fpcr" : "=r"(csr)); + return csr; + } + + inline void setcsr(uint32_t csr) noexcept + { + __asm__ __volatile__("msr fpcr, %0" : : "r"(csr)); + } +#else +# error "Missing implementation for FPSCR." +#endif + inline uint32_t loadFromHost() noexcept { - csr = simde_mm_getcsr(); - return HostToGuest[(csr & SIMDE_MM_ROUND_MASK) >> 13]; + csr = getcsr(); + return HostToGuest[(csr & RoundMask) >> RoundShift]; } inline void storeFromGuest(uint32_t value) noexcept { - csr &= ~SIMDE_MM_ROUND_MASK; + csr &= ~RoundMask; csr |= GuestToHost[value & PPC_ROUND_MASK]; - simde_mm_setcsr(csr); + setcsr(csr); } - static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; - inline void enableFlushModeUnconditional() noexcept { csr |= FlushMask; - simde_mm_setcsr(csr); + setcsr(csr); } inline void disableFlushModeUnconditional() noexcept { csr &= ~FlushMask; - simde_mm_setcsr(csr); + setcsr(csr); } inline void enableFlushMode() noexcept @@ -256,7 +293,7 @@ struct PPCFPSCRRegister if ((csr & FlushMask) != FlushMask) [[unlikely]] { csr |= FlushMask; - simde_mm_setcsr(csr); + setcsr(csr); } } @@ -265,7 +302,7 @@ struct PPCFPSCRRegister if ((csr & FlushMask) != 0) [[unlikely]] { csr &= ~FlushMask; - simde_mm_setcsr(csr); + setcsr(csr); } } }; @@ -657,4 +694,16 @@ inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b) return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10)); } +#if defined(__aarch64__) || defined(_M_ARM64) +inline uint64_t __rdtsc() +{ + uint64_t ret; + asm volatile("mrs %0, cntvct_el0\n\t" + : "=r"(ret)::"memory"); + return ret; +} +#elif !defined(__x86_64__) && !defined(_M_X64) +# error "Missing implementation for __rdtsc()" +#endif + #endif From 9ceefb2c1712094e4d17d5d42fedaa3341f231d6 Mon Sep 17 00:00:00 2001 From: Ultra TM Date: Wed, 5 Mar 2025 19:53:39 +0100 Subject: [PATCH 15/18] Add certain instructions for NG2 --- XenonRecomp/recompiler.cpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp index 829a7c2..f91b67f 100644 --- a/XenonRecomp/recompiler.cpp +++ b/XenonRecomp/recompiler.cpp @@ -1228,7 +1228,9 @@ bool Recompiler::Recompile( case PPC_INST_LVEWX: case PPC_INST_LVEWX128: case PPC_INST_LVX: + case PPC_INST_LVXL: case PPC_INST_LVX128: + case PPC_INST_LVXL128: case PPC_INST_LVEHX: // NOTE: for endian swapping, we reverse the whole vector instead of individual elements. // this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz) @@ -1400,8 +1402,22 @@ bool Recompiler::Recompile( println("\t{}.ca = ({}.u64 & 0x20000000) != 0;", xer(), r(insn.operands[0])); break; + case PPC_INST_MULHD: + println("\t{}.u64 = (int64_t({}.s64) * int64_t({}.s64)) >> 64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s64, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + + case PPC_INST_MULHDU: + println("\t{}.u64 = (uint64_t({}.u64) * uint64_t({}.u64)) >> 64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s64, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_MULHW: println("\t{}.s64 = (int64_t({}.s32) * int64_t({}.s32)) >> 32;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; case PPC_INST_MULHWU: @@ -1754,7 +1770,9 @@ bool Recompiler::Recompile( break; case PPC_INST_STVLX: + case PPC_INST_STVLXL: case PPC_INST_STVLX128: + case PPC_INST_STVLXL128: // TODO: vectorize // NOTE: accounting for the full vector reversal here print("\t{} = ", ea()); @@ -1767,7 +1785,9 @@ bool Recompiler::Recompile( break; case PPC_INST_STVRX: + case PPC_INST_STVRXL: case PPC_INST_STVRX128: + case PPC_INST_STVRXL128: // TODO: vectorize // NOTE: accounting for the full vector reversal here print("\t{} = ", ea()); @@ -1936,6 +1956,7 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VANDC: case PPC_INST_VANDC128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; @@ -2177,6 +2198,16 @@ bool Recompiler::Recompile( println("\t_mm_store_ps({}.f32, _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); break; + case PPC_INST_VNOR: + print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); + + if (insn.operands[1] != insn.operands[2]) + println("_mm_andnot_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi8(0xFF));", v(insn.operands[1]), v(insn.operands[2])); + else + println("_mm_setzero_si128());"); + + break; + case PPC_INST_VOR: case PPC_INST_VOR128: print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); From f40eef40b848c07a4a2628ff08f47f2fab609499 Mon Sep 17 00:00:00 2001 From: Ultra TM Date: Wed, 5 Mar 2025 20:31:05 +0100 Subject: [PATCH 16/18] Fix missing paranthesis --- XenonRecomp/recompiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp index f91b67f..8a3a743 100644 --- a/XenonRecomp/recompiler.cpp +++ b/XenonRecomp/recompiler.cpp @@ -2202,7 +2202,7 @@ bool Recompiler::Recompile( print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); if (insn.operands[1] != insn.operands[2]) - println("_mm_andnot_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi8(0xFF));", v(insn.operands[1]), v(insn.operands[2])); + println("_mm_andnot_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi8(0xFF)));", v(insn.operands[1]), v(insn.operands[2])); else println("_mm_setzero_si128());"); From bbbd77a632e8653991a223fd1aa51f3d11bb0bec Mon Sep 17 00:00:00 2001 From: Ultra TM Date: Thu, 6 Mar 2025 00:18:26 +0100 Subject: [PATCH 17/18] Make all vector intrinsics use simde --- XenonRecomp/recompiler.cpp | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp index 39dc0e7..854fddc 100644 --- a/XenonRecomp/recompiler.cpp +++ b/XenonRecomp/recompiler.cpp @@ -1939,7 +1939,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VADDSBS: - println("\t_mm_store_si128((__m128i*){}.s8, _mm_adds_epi8(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s8, simde_mm_adds_epi8(simde_mm_load_si128((simde__m128i*){}.s8), simde_mm_load_si128((simde__m128i*){}.s8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDSHS: @@ -1998,7 +1998,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VAVGUH: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epu16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VCTSXS: @@ -2014,11 +2014,11 @@ bool Recompiler::Recompile( case PPC_INST_VCTUXS: case PPC_INST_VCFPUXWS128: printSetFlushMode(true); - print("\t_mm_store_si128((__m128i*){}.u32, _mm_vctuxs(", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_vctuxs(", v(insn.operands[0])); if (insn.operands[2] != 0) - println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); + println("simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); else - println("_mm_load_ps({}.f32)));", v(insn.operands[1])); + println("simde_mm_load_ps({}.f32)));", v(insn.operands[1])); break; case PPC_INST_VCFSX: @@ -2075,9 +2075,9 @@ bool Recompiler::Recompile( break; case PPC_INST_VCMPEQUH: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPEQUW: @@ -2106,25 +2106,25 @@ bool Recompiler::Recompile( case PPC_INST_VCMPGTUB: println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTUH: println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTSH: - println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s8, simde_mm_cmpgt_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s16), 0xFFFF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.s16), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTSW: - println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s8, simde_mm_cmpgt_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s32), 0xFFFF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.s32), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VEXPTEFP: @@ -2157,7 +2157,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VMAXSH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_max_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_max_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMAXSW: @@ -2165,7 +2165,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VMINSH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_min_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_min_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMINFP: @@ -2223,12 +2223,12 @@ bool Recompiler::Recompile( break; case PPC_INST_VNOR: - print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0])); if (insn.operands[1] != insn.operands[2]) - println("_mm_andnot_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi8(0xFF)));", v(insn.operands[1]), v(insn.operands[2])); + println("simde_mm_andnot_si128(simde_mm_or_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)), simde_mm_set1_epi8(0xFF)));", v(insn.operands[1]), v(insn.operands[2])); else - println("_mm_setzero_si128());"); + println("simde_mm_setzero_si128());"); break; @@ -2288,12 +2288,12 @@ bool Recompiler::Recompile( case PPC_INST_VPKSHSS: case PPC_INST_VPKSHSS128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_packs_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VPKSWSS: case PPC_INST_VPKSWSS128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_packs_epi32(simde_mm_load_si128((simde__m128i*){}.s32), simde_mm_load_si128((simde__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VPKSHUS: @@ -2303,7 +2303,7 @@ bool Recompiler::Recompile( case PPC_INST_VPKSWUS: case PPC_INST_VPKSWUS128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_packus_epi32(simde_mm_load_si128((simde__m128i*){}.s32), simde_mm_load_si128((simde__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VPKUHUS: @@ -2415,7 +2415,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VSPLTISH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_set1_epi16(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_set1_epi16(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); break; case PPC_INST_VSPLTISW: @@ -2498,11 +2498,11 @@ bool Recompiler::Recompile( break; case PPC_INST_VSUBUBM: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_sub_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VSUBUHM: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_sub_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VUPKD3D128: From 180084ca55128593dc1d77e4f325e0990f45eb9f Mon Sep 17 00:00:00 2001 From: Ultra TM Date: Thu, 6 Mar 2025 19:50:53 +0100 Subject: [PATCH 18/18] Fix obsolete function header without body --- XenonUtils/ppc_context.h | 1 - 1 file changed, 1 deletion(-) diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h index 6996f7c..052ddd2 100644 --- a/XenonUtils/ppc_context.h +++ b/XenonUtils/ppc_context.h @@ -707,7 +707,6 @@ inline simde__m128i simde_mm_vctuxs(simde__m128 src1) return simde_mm_or_si128(dest, xmm0); } -inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b) #if defined(__aarch64__) || defined(_M_ARM64) inline uint64_t __rdtsc() {