diff --git a/.gitmodules b/.gitmodules index d12d6a2..271d969 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "thirdparty/tiny-AES-c"] path = thirdparty/tiny-AES-c url = https://github.com/kokke/tiny-AES-c.git +[submodule "thirdparty/simde"] + path = thirdparty/simde + url = https://github.com/simd-everywhere/simde-no-tests.git diff --git a/XenonRecomp/CMakeLists.txt b/XenonRecomp/CMakeLists.txt index 623a06f..f5db6d1 100644 --- a/XenonRecomp/CMakeLists.txt +++ b/XenonRecomp/CMakeLists.txt @@ -19,6 +19,11 @@ target_link_libraries(XenonRecomp PRIVATE if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") target_compile_options(XenonRecomp PRIVATE -Wno-switch -Wno-unused-variable -Wno-null-arithmetic) + + # alias attribute not supported on Apple. + if (NOT APPLE) + target_compile_definitions(XenonRecomp PRIVATE XENON_RECOMP_USE_ALIAS) + endif() endif() target_compile_definitions(XenonRecomp PRIVATE _CRT_SECURE_NO_WARNINGS) diff --git a/XenonRecomp/pch.h b/XenonRecomp/pch.h index 02e53d9..c9f9a85 100644 --- a/XenonRecomp/pch.h +++ b/XenonRecomp/pch.h @@ -16,4 +16,4 @@ #include #include #include -#include +#include diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp index 8a3a743..39dc0e7 100644 --- a/XenonRecomp/recompiler.cpp +++ b/XenonRecomp/recompiler.cpp @@ -99,48 +99,72 @@ void Recompiler::Analyse() { if (i < 32) { - auto& restgpr = functions.emplace_back(); - restgpr.base = config.restGpr14Address + (i - 14) * 4; - restgpr.size = (32 - i) * 4 + 12; - image.symbols.emplace(Symbol{ fmt::format("__restgprlr_{}", i), restgpr.base, restgpr.size, Symbol_Function }); + if (config.restGpr14Address != 0) + { + auto& restgpr = functions.emplace_back(); + restgpr.base = config.restGpr14Address + (i - 14) * 4; + restgpr.size = (32 - i) * 4 + 12; + image.symbols.emplace(Symbol{ fmt::format("__restgprlr_{}", i), restgpr.base, restgpr.size, Symbol_Function }); + } - auto& savegpr = functions.emplace_back(); - savegpr.base = 
config.saveGpr14Address + (i - 14) * 4; - savegpr.size = (32 - i) * 4 + 8; - image.symbols.emplace(fmt::format("__savegprlr_{}", i), savegpr.base, savegpr.size, Symbol_Function); + if (config.saveGpr14Address != 0) + { + auto& savegpr = functions.emplace_back(); + savegpr.base = config.saveGpr14Address + (i - 14) * 4; + savegpr.size = (32 - i) * 4 + 8; + image.symbols.emplace(fmt::format("__savegprlr_{}", i), savegpr.base, savegpr.size, Symbol_Function); + } - auto& restfpr = functions.emplace_back(); - restfpr.base = config.restFpr14Address + (i - 14) * 4; - restfpr.size = (32 - i) * 4 + 4; - image.symbols.emplace(fmt::format("__restfpr_{}", i), restfpr.base, restfpr.size, Symbol_Function); + if (config.restFpr14Address != 0) + { + auto& restfpr = functions.emplace_back(); + restfpr.base = config.restFpr14Address + (i - 14) * 4; + restfpr.size = (32 - i) * 4 + 4; + image.symbols.emplace(fmt::format("__restfpr_{}", i), restfpr.base, restfpr.size, Symbol_Function); + } - auto& savefpr = functions.emplace_back(); - savefpr.base = config.saveFpr14Address + (i - 14) * 4; - savefpr.size = (32 - i) * 4 + 4; - image.symbols.emplace(fmt::format("__savefpr_{}", i), savefpr.base, savefpr.size, Symbol_Function); + if (config.saveFpr14Address != 0) + { + auto& savefpr = functions.emplace_back(); + savefpr.base = config.saveFpr14Address + (i - 14) * 4; + savefpr.size = (32 - i) * 4 + 4; + image.symbols.emplace(fmt::format("__savefpr_{}", i), savefpr.base, savefpr.size, Symbol_Function); + } - auto& restvmx = functions.emplace_back(); - restvmx.base = config.restVmx14Address + (i - 14) * 8; - restvmx.size = (32 - i) * 8 + 4; - image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function); + if (config.restVmx14Address != 0) + { + auto& restvmx = functions.emplace_back(); + restvmx.base = config.restVmx14Address + (i - 14) * 8; + restvmx.size = (32 - i) * 8 + 4; + image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, 
restvmx.size, Symbol_Function); + } - auto& savevmx = functions.emplace_back(); - savevmx.base = config.saveVmx14Address + (i - 14) * 8; - savevmx.size = (32 - i) * 8 + 4; - image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); + if (config.saveVmx14Address != 0) + { + auto& savevmx = functions.emplace_back(); + savevmx.base = config.saveVmx14Address + (i - 14) * 8; + savevmx.size = (32 - i) * 8 + 4; + image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); + } } if (i >= 64) { - auto& restvmx = functions.emplace_back(); - restvmx.base = config.restVmx64Address + (i - 64) * 8; - restvmx.size = (128 - i) * 8 + 4; - image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function); + if (config.restVmx64Address != 0) + { + auto& restvmx = functions.emplace_back(); + restvmx.base = config.restVmx64Address + (i - 64) * 8; + restvmx.size = (128 - i) * 8 + 4; + image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function); + } - auto& savevmx = functions.emplace_back(); - savevmx.base = config.saveVmx64Address + (i - 64) * 8; - savevmx.size = (128 - i) * 8 + 4; - image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); + if (config.saveVmx64Address != 0) + { + auto& savevmx = functions.emplace_back(); + savevmx.base = config.saveVmx64Address + (i - 64) * 8; + savevmx.size = (128 - i) * 8 + 4; + image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); + } } } @@ -929,17 +953,17 @@ bool Recompiler::Recompile( case PPC_INST_FCTID: printSetFlushMode(false); - println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvtsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); + println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? 
LLONG_MAX : simde_mm_cvtsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); break; case PPC_INST_FCTIDZ: printSetFlushMode(false); - println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvttsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); + println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : simde_mm_cvttsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); break; case PPC_INST_FCTIWZ: printSetFlushMode(false); - println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : _mm_cvttsd_si32(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); + println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : simde_mm_cvttsd_si32(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); break; case PPC_INST_FDIV: @@ -1234,10 +1258,10 @@ bool Recompiler::Recompile( case PPC_INST_LVEHX: // NOTE: for endian swapping, we reverse the whole vector instead of individual elements. // this is accounted for in every instruction (eg. 
dp3 sums yzw instead of xyz) - print("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ((", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ((", v(insn.operands[0])); if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); - println("{}.u32) & ~0xF))), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2])); + println("{}.u32) & ~0xF))), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2])); break; case PPC_INST_LVLX: @@ -1246,7 +1270,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp()); break; case PPC_INST_LVRX: @@ -1255,7 +1279,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, {}.u32 & 0xF ? _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : _mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, {}.u32 & 0xF ? 
simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : simde_mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp()); break; case PPC_INST_LVSL: @@ -1263,7 +1287,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); break; case PPC_INST_LVSR: @@ -1271,7 +1295,7 @@ bool Recompiler::Recompile( if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); println("{}.u32;", r(insn.operands[2])); - println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); break; case PPC_INST_LWA: @@ -1801,10 +1825,10 @@ bool Recompiler::Recompile( case PPC_INST_STVX: case PPC_INST_STVX128: - print("\t_mm_store_si128((__m128i*)(base + (("); + print("\tsimde_mm_store_si128((simde__m128i*)(base + (("); if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); - println("{}.u32) & ~0xF)), _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0])); + println("{}.u32) & ~0xF)), simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0])); break; case PPC_INST_STW: @@ -1911,7 +1935,7 @@ bool Recompiler::Recompile( case PPC_INST_VADDFP: case 
PPC_INST_VADDFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDSBS: @@ -1919,7 +1943,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VADDSHS: - println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_adds_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDSWS: @@ -1932,45 +1956,45 @@ bool Recompiler::Recompile( break; case PPC_INST_VADDUBM: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_add_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUBS: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_adds_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_adds_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUHM: - println("\t_mm_store_si128((__m128i*){}.u16, 
_mm_add_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_add_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUWM: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_add_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_add_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VADDUWS: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_adds_epu32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_adds_epu32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VAND: case PPC_INST_VAND128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VANDC: case PPC_INST_VANDC128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), 
v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VAVGSB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VAVGSH: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi16(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VAVGUB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VAVGUH: @@ -1980,11 +2004,11 @@ bool Recompiler::Recompile( case PPC_INST_VCTSXS: case PPC_INST_VCFPSXWS128: printSetFlushMode(true); - print("\t_mm_store_si128((__m128i*){}.s32, _mm_vctsxs(", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_vctsxs(", v(insn.operands[0])); if (insn.operands[2] != 0) - 
println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); + println("simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); else - println("_mm_load_ps({}.f32)));", v(insn.operands[1])); + println("simde_mm_load_ps({}.f32)));", v(insn.operands[1])); break; case PPC_INST_VCTUXS: @@ -2001,15 +2025,15 @@ bool Recompiler::Recompile( case PPC_INST_VCSXWFP128: { printSetFlushMode(true); - print("\t_mm_store_ps({}.f32, ", v(insn.operands[0])); + print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0])); if (insn.operands[2] != 0) { const float value = ldexp(1.0f, -int32_t(insn.operands[2])); - println("_mm_mul_ps(_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); + println("simde_mm_mul_ps(simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); } else { - println("_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1])); + println("simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1])); } break; } @@ -2018,15 +2042,15 @@ bool Recompiler::Recompile( case PPC_INST_VCUXWFP128: { printSetFlushMode(true); - print("\t_mm_store_ps({}.f32, ", v(insn.operands[0])); + print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0])); if (insn.operands[2] != 0) { const float value = ldexp(1.0f, -int32_t(insn.operands[2])); - println("_mm_mul_ps(_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); + println("simde_mm_mul_ps(simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast(&value)); } else { - 
println("_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1])); + println("simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1])); } break; } @@ -2039,15 +2063,15 @@ bool Recompiler::Recompile( case PPC_INST_VCMPEQFP: case PPC_INST_VCMPEQFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_cmpeq_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpeq_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPEQUB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPEQUH: @@ -2058,35 +2082,35 @@ bool Recompiler::Recompile( case PPC_INST_VCMPEQUW: case PPC_INST_VCMPEQUW128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, 
simde_mm_cmpeq_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGEFP: case PPC_INST_VCMPGEFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_cmpge_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpge_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTFP: case PPC_INST_VCMPGTFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_cmpgt_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpgt_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) - println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTUB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu8(simde_mm_load_si128((simde__m128i*){}.u8), 
simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTUH: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); if (strchr(insn.opcode->name, '.')) println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); break; @@ -2123,13 +2147,13 @@ bool Recompiler::Recompile( case PPC_INST_VMADDFP: case PPC_INST_VMADDFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); break; case PPC_INST_VMAXFP: case PPC_INST_VMAXFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_max_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMAXSH: @@ -2137,7 +2161,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VMAXSW: - println("\t_mm_store_si128((__m128i*){}.u32, 
_mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_max_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMINSH: @@ -2147,55 +2171,55 @@ bool Recompiler::Recompile( case PPC_INST_VMINFP: case PPC_INST_VMINFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_min_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_min_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMRGHB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpackhi_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpackhi_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGHH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpackhi_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpackhi_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGHW: case PPC_INST_VMRGHW128: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpackhi_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", 
v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpackhi_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGLB: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpacklo_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpacklo_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGLH: - println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpacklo_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpacklo_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMRGLW: case PPC_INST_VMRGLW128: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpacklo_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpacklo_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VMSUM3FP128: // NOTE: accounting for full vector reversal here. 
should dot product yzw instead of xyz printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMSUM4FP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VMULFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VNMSUBFP: case PPC_INST_VNMSUBFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_xor_ps(simde_mm_sub_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); break; case PPC_INST_VNOR: @@ -2210,18 +2234,18 @@ bool Recompiler::Recompile( case PPC_INST_VOR: case PPC_INST_VOR128: - 
print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0])); if (insn.operands[1] != insn.operands[2]) - println("_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); + println("simde_mm_or_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); else - println("_mm_load_si128((__m128i*){}.u8));", v(insn.operands[1])); + println("simde_mm_load_si128((simde__m128i*){}.u8));", v(insn.operands[1])); break; case PPC_INST_VPERM: case PPC_INST_VPERM128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_perm_epi8_(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_perm_epi8_(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); break; case PPC_INST_VPERMWI128: @@ -2232,7 +2256,7 @@ bool Recompiler::Recompile( uint32_t z = 3 - ((insn.operands[2] >> 4) & 0x3); uint32_t w = 3 - ((insn.operands[2] >> 6) & 0x3); uint32_t perm = x | (y << 2) | (z << 4) | (w << 6); - println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } @@ -2274,7 +2298,7 @@ bool Recompiler::Recompile( case PPC_INST_VPKSHUS: case PPC_INST_VPKSHUS128: - println("\t_mm_store_si128((__m128i*){}.u8, 
_mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_packus_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; case PPC_INST_VPKSWUS: @@ -2296,31 +2320,31 @@ bool Recompiler::Recompile( case PPC_INST_VREFP128: // TODO: see if we can use rcp safely printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIM: case PPC_INST_VRFIM128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIN: case PPC_INST_VRFIN128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRFIZ: case PPC_INST_VRFIZ128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), 
SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VRLIMI128: { - constexpr size_t shuffles[] = { _MM_SHUFFLE(3, 2, 1, 0), _MM_SHUFFLE(2, 1, 0, 3), _MM_SHUFFLE(1, 0, 3, 2), _MM_SHUFFLE(0, 3, 2, 1) }; - println("\t_mm_store_ps({}.f32, _mm_blend_ps(_mm_load_ps({}.f32), _mm_permute_ps(_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]); + constexpr size_t shuffles[] = { SIMDE_MM_SHUFFLE(3, 2, 1, 0), SIMDE_MM_SHUFFLE(2, 1, 0, 3), SIMDE_MM_SHUFFLE(1, 0, 3, 2), SIMDE_MM_SHUFFLE(0, 3, 2, 1) }; + println("\tsimde_mm_store_ps({}.f32, simde_mm_blend_ps(simde_mm_load_ps({}.f32), simde_mm_permute_ps(simde_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]); break; } @@ -2337,12 +2361,12 @@ bool Recompiler::Recompile( // TODO: see if we can use rsqrt safely // TODO: we can detect if the input is from a dot product and apply logic only on one value printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_sqrt_ps(_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_sqrt_ps(simde_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VSEL: case PPC_INST_VSEL128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_or_si128(simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)), 
simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2])); break; case PPC_INST_VSLB: @@ -2359,7 +2383,7 @@ bool Recompiler::Recompile( case PPC_INST_VSLDOI: case PPC_INST_VSLDOI128: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_alignr_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]); break; case PPC_INST_VSLW: @@ -2373,7 +2397,7 @@ bool Recompiler::Recompile( { // NOTE: accounting for full vector reversal here uint32_t perm = 15 - insn.operands[2]; - println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } @@ -2382,12 +2406,12 @@ bool Recompiler::Recompile( // NOTE: accounting for full vector reversal here uint32_t perm = 7 - insn.operands[2]; perm = (perm * 2) | ((perm * 2 + 1) << 8); - println("\t_mm_store_si128((__m128i*){}.u16, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } case PPC_INST_VSPLTISB: - 
println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]); break; case PPC_INST_VSPLTISH: @@ -2396,7 +2420,7 @@ bool Recompiler::Recompile( case PPC_INST_VSPLTISW: case PPC_INST_VSPLTISW128: - println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); break; case PPC_INST_VSPLTW: @@ -2405,12 +2429,12 @@ bool Recompiler::Recompile( // NOTE: accounting for full vector reversal here uint32_t perm = 3 - insn.operands[2]; perm |= (perm << 2) | (perm << 4) | (perm << 6); - println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); + println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); break; } case PPC_INST_VSR: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsr(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_vsr(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VSRAB: @@ -2448,7 +2472,7 @@ bool Recompiler::Recompile( case PPC_INST_VSUBFP: case PPC_INST_VSUBFP128: printSetFlushMode(true); - println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_ps({}.f32, 
simde_mm_sub_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VSUBSHS: @@ -2470,7 +2494,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VSUBUBS: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_subs_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_subs_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VSUBUBM: @@ -2515,32 +2539,32 @@ bool Recompiler::Recompile( case PPC_INST_VUPKHSB: case PPC_INST_VUPKHSB128: - println("\t_mm_store_si128((__m128i*){}.s16, _mm_cvtepi8_epi16(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_cvtepi8_epi16(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s8), simde_mm_load_si128((simde__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); break; case PPC_INST_VUPKHSH: case PPC_INST_VUPKHSH128: - println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); break; case PPC_INST_VUPKLSB: case PPC_INST_VUPKLSB128: - println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi8_epi16(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), 
v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi8_epi16(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VUPKLSH: case PPC_INST_VUPKLSH128: - println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); + println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); break; case PPC_INST_VXOR: case PPC_INST_VXOR128: - print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); + print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0])); if (insn.operands[1] != insn.operands[2]) - println("_mm_xor_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); + println("simde_mm_xor_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); else - println("_mm_setzero_si128());"); + println("simde_mm_setzero_si128());"); break; @@ -2676,7 +2700,10 @@ bool Recompiler::Recompile(const Function& fn) name = fmt::format("sub_{}", fn.base); } +#ifdef XENON_RECOMP_USE_ALIAS println("__attribute__((alias(\"__imp__{}\"))) PPC_WEAK_FUNC({});", name, name); +#endif + println("PPC_FUNC_IMPL(__imp__{}) {{", name); println("\tPPC_FUNC_PROLOGUE();"); @@ -2737,6 +2764,12 @@ bool Recompiler::Recompile(const Function& fn) println("}}\n"); +#ifndef XENON_RECOMP_USE_ALIAS + println("PPC_WEAK_FUNC({}) {{", name); + println("\t__imp__{}(ctx, base);", name); + println("}}\n"); +#endif + std::swap(out, tempString); if (localVariables.ctr) println("\tPPCRegister ctr{{}};"); diff --git a/XenonRecomp/recompiler_config.cpp b/XenonRecomp/recompiler_config.cpp index d746b68..81330a4 100644 --- a/XenonRecomp/recompiler_config.cpp +++ 
b/XenonRecomp/recompiler_config.cpp @@ -38,6 +38,15 @@ void RecompilerConfig::Load(const std::string_view& configFilePath) longJmpAddress = main["longjmp_address"].value_or(0u); setJmpAddress = main["setjmp_address"].value_or(0u); + if (restGpr14Address == 0) fmt::println("ERROR: __restgprlr_14 address is unspecified"); + if (saveGpr14Address == 0) fmt::println("ERROR: __savegprlr_14 address is unspecified"); + if (restFpr14Address == 0) fmt::println("ERROR: __restfpr_14 address is unspecified"); + if (saveFpr14Address == 0) fmt::println("ERROR: __savefpr_14 address is unspecified"); + if (restVmx14Address == 0) fmt::println("ERROR: __restvmx_14 address is unspecified"); + if (saveVmx14Address == 0) fmt::println("ERROR: __savevmx_14 address is unspecified"); + if (restVmx64Address == 0) fmt::println("ERROR: __restvmx_64 address is unspecified"); + if (saveVmx64Address == 0) fmt::println("ERROR: __savevmx_64 address is unspecified"); + if (auto functionsArray = main["functions"].as_array()) { for (auto& func : *functionsArray) diff --git a/XenonUtils/CMakeLists.txt b/XenonUtils/CMakeLists.txt index b8ca66f..1807323 100644 --- a/XenonUtils/CMakeLists.txt +++ b/XenonUtils/CMakeLists.txt @@ -17,8 +17,9 @@ target_compile_definitions(XenonUtils ) target_include_directories(XenonUtils - PUBLIC + PUBLIC . 
+ "${THIRDPARTY_ROOT}/simde" PRIVATE "${THIRDPARTY_ROOT}/libmspack/libmspack/mspack" "${THIRDPARTY_ROOT}/tiny-AES-c" diff --git a/XenonUtils/image.h b/XenonUtils/image.h index e443852..be61119 100644 --- a/XenonUtils/image.h +++ b/XenonUtils/image.h @@ -2,7 +2,6 @@ #include #include #include -#include #include #include "symbol_table.h" diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h index de29be2..6996f7c 100644 --- a/XenonUtils/ppc_context.h +++ b/XenonUtils/ppc_context.h @@ -12,13 +12,13 @@ #include #include -#include +#include +#include +#include -#ifdef _WIN32 -#include -#else -#include -#include +// SSE3 constants are missing from simde +#ifndef _MM_DENORMALS_ZERO_MASK +#define _MM_DENORMALS_ZERO_MASK 0x0040 #endif #define PPC_JOIN(x, y) x##y @@ -175,18 +175,18 @@ struct PPCCRRegister eq = !un && (left == right); } - inline void setFromMask(__m128 mask, int imm) noexcept + inline void setFromMask(simde__m128 mask, int imm) noexcept { - int m = _mm_movemask_ps(mask); + int m = simde_mm_movemask_ps(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal so = 0; } - inline void setFromMask(__m128i mask, int imm) noexcept + inline void setFromMask(simde__m128i mask, int imm) noexcept { - int m = _mm_movemask_epi8(mask); + int m = simde_mm_movemask_epi8(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal @@ -221,34 +221,71 @@ struct PPCFPSCRRegister { uint32_t csr; - static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN }; static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO }; + // simde does not handle denormal flags, so we need to implement per-arch. 
+#if defined(__x86_64__) || defined(_M_X64) + static constexpr size_t RoundShift = 13; + static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK; + static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; + static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN }; + + inline uint32_t getcsr() noexcept + { + return simde_mm_getcsr(); + } + + inline void setcsr(uint32_t csr) noexcept + { + simde_mm_setcsr(csr); + } +#elif defined(__aarch64__) || defined(_M_ARM64) + // RMode: FPCR rounding-mode field, bits [23:22] + static constexpr size_t RoundShift = 22; + static constexpr size_t RoundMask = 3 << RoundShift; + // FZ (bit 24) and FZ16 (bit 19) + static constexpr size_t FlushMask = (1 << 19) | (1 << 24); + // Guest order: Nearest, Zero, Up (+Infinity), Down (-Infinity); RMode encodes 0 = nearest, 1 = up, 2 = down, 3 = zero. NOTE(review): the shared HostToGuest table is indexed by this host field but is laid out in x86 order (nearest, down, up, zero) - confirm up/down are not swapped on AArch64. + static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift }; + + inline uint32_t getcsr() noexcept + { + uint64_t csr; + __asm__ __volatile__("mrs %0, fpcr" : "=r"(csr)); + return csr; + } + + inline void setcsr(uint32_t csr) noexcept + { + __asm__ __volatile__("msr fpcr, %0" : : "r"(csr)); + } +#else +# error "Missing implementation for FPSCR."
+#endif + inline uint32_t loadFromHost() noexcept { - csr = _mm_getcsr(); - return HostToGuest[(csr & _MM_ROUND_MASK) >> 13]; + csr = getcsr(); + return HostToGuest[(csr & RoundMask) >> RoundShift]; } inline void storeFromGuest(uint32_t value) noexcept { - csr &= ~_MM_ROUND_MASK; + csr &= ~RoundMask; csr |= GuestToHost[value & PPC_ROUND_MASK]; - _mm_setcsr(csr); + setcsr(csr); } - static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; - inline void enableFlushModeUnconditional() noexcept { csr |= FlushMask; - _mm_setcsr(csr); + setcsr(csr); } inline void disableFlushModeUnconditional() noexcept { csr &= ~FlushMask; - _mm_setcsr(csr); + setcsr(csr); } inline void enableFlushMode() noexcept @@ -256,7 +293,7 @@ struct PPCFPSCRRegister if ((csr & FlushMask) != FlushMask) [[unlikely]] { csr |= FlushMask; - _mm_setcsr(csr); + setcsr(csr); } } @@ -265,7 +302,7 @@ struct PPCFPSCRRegister if ((csr & FlushMask) != 0) [[unlikely]] { csr &= ~FlushMask; - _mm_setcsr(csr); + setcsr(csr); } } }; @@ -593,81 +630,94 @@ inline uint8_t VectorShiftTableR[] = 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, }; -inline __m128i _mm_adds_epu32(__m128i a, __m128i b) +inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b) { - return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b)); + return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b)); } -inline __m128i _mm_avg_epi8(__m128i a, __m128i b) +inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b) { - __m128i c = _mm_set1_epi8(char(128)); - return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); + simde__m128i c = simde_mm_set1_epi8(char(128)); + return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b))); } -inline __m128i _mm_avg_epi16(__m128i a, __m128i b) +inline simde__m128i 
simde_mm_avg_epi16(simde__m128i a, simde__m128i b) { - __m128i c = _mm_set1_epi16(short(32768)); - return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); + simde__m128i c = simde_mm_set1_epi16(short(32768)); + return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b))); } -inline __m128 _mm_cvtepu32_ps_(__m128i src1) +inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1) { - __m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127)); - __m128i xmm0 = _mm_slli_epi32(src1, 31 - 8); - xmm0 = _mm_srli_epi32(xmm0, 31); - xmm0 = _mm_add_epi32(xmm0, xmm1); - xmm0 = _mm_srai_epi32(xmm0, 8); - xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000)); - __m128 xmm2 = _mm_cvtepi32_ps(src1); - return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1)); + simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127)); + simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8); + xmm0 = simde_mm_srli_epi32(xmm0, 31); + xmm0 = simde_mm_add_epi32(xmm0, xmm1); + xmm0 = simde_mm_srai_epi32(xmm0, 8); + xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000)); + simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1); + return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1)); } -inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c) +inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c) { - __m128i d = _mm_set1_epi8(0xF); - __m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d)); - return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3)); + simde__m128i d = simde_mm_set1_epi8(0xF); + simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d)); + return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3)); } -inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b) +inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b) { - 
__m128i c = _mm_set1_epi8(char(128)); - return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); + simde__m128i c = simde_mm_set1_epi8(char(128)); + return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c)); } -inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b) +inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b) { - __m128i c = _mm_set1_epi16(short(32768)); - return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); + simde__m128i c = simde_mm_set1_epi16(short(32768)); + return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c)); } -inline __m128i _mm_vctsxs(__m128 src1) +inline simde__m128i simde_mm_vctsxs(simde__m128 src1) { - __m128 xmm2 = _mm_cmpunord_ps(src1, src1); - __m128i xmm0 = _mm_cvttps_epi32(src1); - __m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN)); - xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1); - __m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1)); - return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest)); + simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1); + simde__m128i xmm0 = simde_mm_cvttps_epi32(src1); + simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN)); + xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1); + simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1)); + return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest)); } -inline __m128i _mm_vctuxs(__m128 src1) +inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b) { - __m128 xmm0 = _mm_max_ps(src1, _mm_set1_epi32(0)); - __m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000)); - __m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000)); - xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1); - __m128i dest = _mm_cvttps_epi32(xmm0); - 
xmm0 = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN)); - xmm1 = _mm_and_si128(xmm1, _mm_set1_epi32(INT_MIN)); - dest = _mm_add_epi32(dest, xmm1); - return _mm_or_si128(dest, xmm0); + b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61); + return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10)); } -inline __m128i _mm_vsr(__m128i a, __m128i b) +inline simde__m128i simde_mm_vctuxs(simde__m128 src1) { - b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61); - return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10)); + simde__m128 xmm0 = simde_mm_max_ps(src1, simde_mm_set1_epi32(0)); + simde__m128 xmm1 = simde_mm_cmpge_ps(xmm0, simde_mm_set1_ps((float)0x80000000)); + simde__m128 xmm2 = simde_mm_sub_ps(xmm0, simde_mm_set1_ps((float)0x80000000)); + xmm0 = simde_mm_blendv_ps(xmm0, xmm2, xmm1); + simde__m128i dest = simde_mm_cvttps_epi32(xmm0); + xmm0 = simde_mm_cmpeq_epi32(dest, simde_mm_set1_epi32(INT_MIN)); + xmm1 = simde_mm_and_si128(xmm1, simde_mm_set1_epi32(INT_MIN)); + dest = simde_mm_add_epi32(dest, xmm1); + return simde_mm_or_si128(dest, xmm0); } +// Provide __rdtsc() on AArch64 by reading the virtual counter register (cntvct_el0). +#if defined(__aarch64__) || defined(_M_ARM64) +inline uint64_t __rdtsc() +{ + uint64_t ret; + asm volatile("mrs %0, cntvct_el0\n\t" + : "=r"(ret)::"memory"); + return ret; +} +#elif !defined(__x86_64__) && !defined(_M_X64) +# error "Missing implementation for __rdtsc()" +#endif + #endif diff --git a/thirdparty/simde b/thirdparty/simde new file mode 160000 index 0000000..a532a12 --- /dev/null +++ b/thirdparty/simde @@ -0,0 +1 @@ +Subproject commit a532a12ca9bbdc5e6547eb602e6256b71a5377d4