Merge branch 'simde' of https://github.com/IsaacMarovitz/XenonRecomp into NG2_MoreInstructions

This commit is contained in:
Ultra TM 2025-03-06 00:10:44 +01:00
commit 5bc2b301b2
9 changed files with 292 additions and 191 deletions

3
.gitmodules vendored
View file

@ -13,3 +13,6 @@
[submodule "thirdparty/tiny-AES-c"] [submodule "thirdparty/tiny-AES-c"]
path = thirdparty/tiny-AES-c path = thirdparty/tiny-AES-c
url = https://github.com/kokke/tiny-AES-c.git url = https://github.com/kokke/tiny-AES-c.git
[submodule "thirdparty/simde"]
path = thirdparty/simde
url = https://github.com/simd-everywhere/simde-no-tests.git

View file

@ -19,6 +19,11 @@ target_link_libraries(XenonRecomp PRIVATE
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
target_compile_options(XenonRecomp PRIVATE -Wno-switch -Wno-unused-variable -Wno-null-arithmetic) target_compile_options(XenonRecomp PRIVATE -Wno-switch -Wno-unused-variable -Wno-null-arithmetic)
# alias attribute not supported on Apple.
if (NOT APPLE)
target_compile_definitions(XenonRecomp PRIVATE XENON_RECOMP_USE_ALIAS)
endif()
endif() endif()
target_compile_definitions(XenonRecomp PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(XenonRecomp PRIVATE _CRT_SECURE_NO_WARNINGS)

View file

@ -16,4 +16,4 @@
#include <xbox.h> #include <xbox.h>
#include <xxhash.h> #include <xxhash.h>
#include <fmt/core.h> #include <fmt/core.h>
#include <xmmintrin.h> #include <x86/sse.h>

View file

@ -99,48 +99,72 @@ void Recompiler::Analyse()
{ {
if (i < 32) if (i < 32)
{ {
auto& restgpr = functions.emplace_back(); if (config.restGpr14Address != 0)
restgpr.base = config.restGpr14Address + (i - 14) * 4; {
restgpr.size = (32 - i) * 4 + 12; auto& restgpr = functions.emplace_back();
image.symbols.emplace(Symbol{ fmt::format("__restgprlr_{}", i), restgpr.base, restgpr.size, Symbol_Function }); restgpr.base = config.restGpr14Address + (i - 14) * 4;
restgpr.size = (32 - i) * 4 + 12;
image.symbols.emplace(Symbol{ fmt::format("__restgprlr_{}", i), restgpr.base, restgpr.size, Symbol_Function });
}
auto& savegpr = functions.emplace_back(); if (config.saveGpr14Address != 0)
savegpr.base = config.saveGpr14Address + (i - 14) * 4; {
savegpr.size = (32 - i) * 4 + 8; auto& savegpr = functions.emplace_back();
image.symbols.emplace(fmt::format("__savegprlr_{}", i), savegpr.base, savegpr.size, Symbol_Function); savegpr.base = config.saveGpr14Address + (i - 14) * 4;
savegpr.size = (32 - i) * 4 + 8;
image.symbols.emplace(fmt::format("__savegprlr_{}", i), savegpr.base, savegpr.size, Symbol_Function);
}
auto& restfpr = functions.emplace_back(); if (config.restFpr14Address != 0)
restfpr.base = config.restFpr14Address + (i - 14) * 4; {
restfpr.size = (32 - i) * 4 + 4; auto& restfpr = functions.emplace_back();
image.symbols.emplace(fmt::format("__restfpr_{}", i), restfpr.base, restfpr.size, Symbol_Function); restfpr.base = config.restFpr14Address + (i - 14) * 4;
restfpr.size = (32 - i) * 4 + 4;
image.symbols.emplace(fmt::format("__restfpr_{}", i), restfpr.base, restfpr.size, Symbol_Function);
}
auto& savefpr = functions.emplace_back(); if (config.saveFpr14Address != 0)
savefpr.base = config.saveFpr14Address + (i - 14) * 4; {
savefpr.size = (32 - i) * 4 + 4; auto& savefpr = functions.emplace_back();
image.symbols.emplace(fmt::format("__savefpr_{}", i), savefpr.base, savefpr.size, Symbol_Function); savefpr.base = config.saveFpr14Address + (i - 14) * 4;
savefpr.size = (32 - i) * 4 + 4;
image.symbols.emplace(fmt::format("__savefpr_{}", i), savefpr.base, savefpr.size, Symbol_Function);
}
auto& restvmx = functions.emplace_back(); if (config.restVmx14Address != 0)
restvmx.base = config.restVmx14Address + (i - 14) * 8; {
restvmx.size = (32 - i) * 8 + 4; auto& restvmx = functions.emplace_back();
image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function); restvmx.base = config.restVmx14Address + (i - 14) * 8;
restvmx.size = (32 - i) * 8 + 4;
image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function);
}
auto& savevmx = functions.emplace_back(); if (config.saveVmx14Address != 0)
savevmx.base = config.saveVmx14Address + (i - 14) * 8; {
savevmx.size = (32 - i) * 8 + 4; auto& savevmx = functions.emplace_back();
image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); savevmx.base = config.saveVmx14Address + (i - 14) * 8;
savevmx.size = (32 - i) * 8 + 4;
image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function);
}
} }
if (i >= 64) if (i >= 64)
{ {
auto& restvmx = functions.emplace_back(); if (config.restVmx64Address != 0)
restvmx.base = config.restVmx64Address + (i - 64) * 8; {
restvmx.size = (128 - i) * 8 + 4; auto& restvmx = functions.emplace_back();
image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function); restvmx.base = config.restVmx64Address + (i - 64) * 8;
restvmx.size = (128 - i) * 8 + 4;
image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function);
}
auto& savevmx = functions.emplace_back(); if (config.saveVmx64Address != 0)
savevmx.base = config.saveVmx64Address + (i - 64) * 8; {
savevmx.size = (128 - i) * 8 + 4; auto& savevmx = functions.emplace_back();
image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); savevmx.base = config.saveVmx64Address + (i - 64) * 8;
savevmx.size = (128 - i) * 8 + 4;
image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function);
}
} }
} }
@ -929,17 +953,17 @@ bool Recompiler::Recompile(
case PPC_INST_FCTID: case PPC_INST_FCTID:
printSetFlushMode(false); printSetFlushMode(false);
println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvtsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : simde_mm_cvtsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
break; break;
case PPC_INST_FCTIDZ: case PPC_INST_FCTIDZ:
printSetFlushMode(false); printSetFlushMode(false);
println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvttsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : simde_mm_cvttsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
break; break;
case PPC_INST_FCTIWZ: case PPC_INST_FCTIWZ:
printSetFlushMode(false); printSetFlushMode(false);
println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : _mm_cvttsd_si32(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1])); println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : simde_mm_cvttsd_si32(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
break; break;
case PPC_INST_FDIV: case PPC_INST_FDIV:
@ -1234,10 +1258,10 @@ bool Recompiler::Recompile(
case PPC_INST_LVEHX: case PPC_INST_LVEHX:
// NOTE: for endian swapping, we reverse the whole vector instead of individual elements. // NOTE: for endian swapping, we reverse the whole vector instead of individual elements.
// this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz) // this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz)
print("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ((", v(insn.operands[0])); print("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ((", v(insn.operands[0]));
if (insn.operands[1] != 0) if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1])); print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32) & ~0xF))), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2])); println("{}.u32) & ~0xF))), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2]));
break; break;
case PPC_INST_LVLX: case PPC_INST_LVLX:
@ -1246,7 +1270,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0) if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1])); print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2])); println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp()); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp());
break; break;
case PPC_INST_LVRX: case PPC_INST_LVRX:
@ -1255,7 +1279,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0) if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1])); print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2])); println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, {}.u32 & 0xF ? _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : _mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp()); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, {}.u32 & 0xF ? simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : simde_mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp());
break; break;
case PPC_INST_LVSL: case PPC_INST_LVSL:
@ -1263,7 +1287,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0) if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1])); print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2])); println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp());
break; break;
case PPC_INST_LVSR: case PPC_INST_LVSR:
@ -1271,7 +1295,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0) if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1])); print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2])); println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp()); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp());
break; break;
case PPC_INST_LWA: case PPC_INST_LWA:
@ -1801,10 +1825,10 @@ bool Recompiler::Recompile(
case PPC_INST_STVX: case PPC_INST_STVX:
case PPC_INST_STVX128: case PPC_INST_STVX128:
print("\t_mm_store_si128((__m128i*)(base + (("); print("\tsimde_mm_store_si128((simde__m128i*)(base + ((");
if (insn.operands[1] != 0) if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1])); print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32) & ~0xF)), _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0])); println("{}.u32) & ~0xF)), simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0]));
break; break;
case PPC_INST_STW: case PPC_INST_STW:
@ -1911,7 +1935,7 @@ bool Recompiler::Recompile(
case PPC_INST_VADDFP: case PPC_INST_VADDFP:
case PPC_INST_VADDFP128: case PPC_INST_VADDFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VADDSBS: case PPC_INST_VADDSBS:
@ -1919,7 +1943,7 @@ bool Recompiler::Recompile(
break; break;
case PPC_INST_VADDSHS: case PPC_INST_VADDSHS:
println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_adds_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VADDSWS: case PPC_INST_VADDSWS:
@ -1932,45 +1956,45 @@ bool Recompiler::Recompile(
break; break;
case PPC_INST_VADDUBM: case PPC_INST_VADDUBM:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_add_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VADDUBS: case PPC_INST_VADDUBS:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_adds_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_adds_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VADDUHM: case PPC_INST_VADDUHM:
println("\t_mm_store_si128((__m128i*){}.u16, _mm_add_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_add_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VADDUWM: case PPC_INST_VADDUWM:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_add_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_add_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VADDUWS: case PPC_INST_VADDUWS:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_adds_epu32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_adds_epu32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VAND: case PPC_INST_VAND:
case PPC_INST_VAND128: case PPC_INST_VAND128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VANDC: case PPC_INST_VANDC:
case PPC_INST_VANDC128: case PPC_INST_VANDC128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VAVGSB: case PPC_INST_VAVGSB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VAVGSH: case PPC_INST_VAVGSH:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi16(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VAVGUB: case PPC_INST_VAVGUB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VAVGUH: case PPC_INST_VAVGUH:
@ -1980,11 +2004,11 @@ bool Recompiler::Recompile(
case PPC_INST_VCTSXS: case PPC_INST_VCTSXS:
case PPC_INST_VCFPSXWS128: case PPC_INST_VCFPSXWS128:
printSetFlushMode(true); printSetFlushMode(true);
print("\t_mm_store_si128((__m128i*){}.s32, _mm_vctsxs(", v(insn.operands[0])); print("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_vctsxs(", v(insn.operands[0]));
if (insn.operands[2] != 0) if (insn.operands[2] != 0)
println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); println("simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]);
else else
println("_mm_load_ps({}.f32)));", v(insn.operands[1])); println("simde_mm_load_ps({}.f32)));", v(insn.operands[1]));
break; break;
case PPC_INST_VCTUXS: case PPC_INST_VCTUXS:
@ -2001,15 +2025,15 @@ bool Recompiler::Recompile(
case PPC_INST_VCSXWFP128: case PPC_INST_VCSXWFP128:
{ {
printSetFlushMode(true); printSetFlushMode(true);
print("\t_mm_store_ps({}.f32, ", v(insn.operands[0])); print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0]));
if (insn.operands[2] != 0) if (insn.operands[2] != 0)
{ {
const float value = ldexp(1.0f, -int32_t(insn.operands[2])); const float value = ldexp(1.0f, -int32_t(insn.operands[2]));
println("_mm_mul_ps(_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value)); println("simde_mm_mul_ps(simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value));
} }
else else
{ {
println("_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1])); println("simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1]));
} }
break; break;
} }
@ -2018,15 +2042,15 @@ bool Recompiler::Recompile(
case PPC_INST_VCUXWFP128: case PPC_INST_VCUXWFP128:
{ {
printSetFlushMode(true); printSetFlushMode(true);
print("\t_mm_store_ps({}.f32, ", v(insn.operands[0])); print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0]));
if (insn.operands[2] != 0) if (insn.operands[2] != 0)
{ {
const float value = ldexp(1.0f, -int32_t(insn.operands[2])); const float value = ldexp(1.0f, -int32_t(insn.operands[2]));
println("_mm_mul_ps(_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value)); println("simde_mm_mul_ps(simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value));
} }
else else
{ {
println("_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1])); println("simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1]));
} }
break; break;
} }
@ -2039,15 +2063,15 @@ bool Recompiler::Recompile(
case PPC_INST_VCMPEQFP: case PPC_INST_VCMPEQFP:
case PPC_INST_VCMPEQFP128: case PPC_INST_VCMPEQFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_cmpeq_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpeq_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break; break;
case PPC_INST_VCMPEQUB: case PPC_INST_VCMPEQUB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0]));
break; break;
case PPC_INST_VCMPEQUH: case PPC_INST_VCMPEQUH:
@ -2058,35 +2082,35 @@ bool Recompiler::Recompile(
case PPC_INST_VCMPEQUW: case PPC_INST_VCMPEQUW:
case PPC_INST_VCMPEQUW128: case PPC_INST_VCMPEQUW128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break; break;
case PPC_INST_VCMPGEFP: case PPC_INST_VCMPGEFP:
case PPC_INST_VCMPGEFP128: case PPC_INST_VCMPGEFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_cmpge_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpge_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break; break;
case PPC_INST_VCMPGTFP: case PPC_INST_VCMPGTFP:
case PPC_INST_VCMPGTFP128: case PPC_INST_VCMPGTFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_cmpgt_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpgt_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break; break;
case PPC_INST_VCMPGTUB: case PPC_INST_VCMPGTUB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0]));
break; break;
case PPC_INST_VCMPGTUH: case PPC_INST_VCMPGTUH:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0]));
break; break;
@ -2123,13 +2147,13 @@ bool Recompiler::Recompile(
case PPC_INST_VMADDFP: case PPC_INST_VMADDFP:
case PPC_INST_VMADDFP128: case PPC_INST_VMADDFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
break; break;
case PPC_INST_VMAXFP: case PPC_INST_VMAXFP:
case PPC_INST_VMAXFP128: case PPC_INST_VMAXFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_max_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VMAXSH: case PPC_INST_VMAXSH:
@ -2137,7 +2161,7 @@ bool Recompiler::Recompile(
break; break;
case PPC_INST_VMAXSW: case PPC_INST_VMAXSW:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_max_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VMINSH: case PPC_INST_VMINSH:
@ -2147,55 +2171,55 @@ bool Recompiler::Recompile(
case PPC_INST_VMINFP: case PPC_INST_VMINFP:
case PPC_INST_VMINFP128: case PPC_INST_VMINFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_min_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_min_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VMRGHB: case PPC_INST_VMRGHB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpackhi_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpackhi_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VMRGHH: case PPC_INST_VMRGHH:
println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpackhi_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpackhi_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VMRGHW: case PPC_INST_VMRGHW:
case PPC_INST_VMRGHW128: case PPC_INST_VMRGHW128:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpackhi_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpackhi_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VMRGLB: case PPC_INST_VMRGLB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpacklo_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpacklo_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VMRGLH: case PPC_INST_VMRGLH:
println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpacklo_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpacklo_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VMRGLW: case PPC_INST_VMRGLW:
case PPC_INST_VMRGLW128: case PPC_INST_VMRGLW128:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpacklo_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpacklo_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VMSUM3FP128: case PPC_INST_VMSUM3FP128:
// NOTE: accounting for full vector reversal here. should dot product yzw instead of xyz // NOTE: accounting for full vector reversal here. should dot product yzw instead of xyz
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VMSUM4FP128: case PPC_INST_VMSUM4FP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VMULFP128: case PPC_INST_VMULFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VNMSUBFP: case PPC_INST_VNMSUBFP:
case PPC_INST_VNMSUBFP128: case PPC_INST_VNMSUBFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); println("\tsimde_mm_store_ps({}.f32, simde_mm_xor_ps(simde_mm_sub_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
break; break;
case PPC_INST_VNOR: case PPC_INST_VNOR:
@ -2210,18 +2234,18 @@ bool Recompiler::Recompile(
case PPC_INST_VOR: case PPC_INST_VOR:
case PPC_INST_VOR128: case PPC_INST_VOR128:
print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0]));
if (insn.operands[1] != insn.operands[2]) if (insn.operands[1] != insn.operands[2])
println("_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); println("simde_mm_or_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2]));
else else
println("_mm_load_si128((__m128i*){}.u8));", v(insn.operands[1])); println("simde_mm_load_si128((simde__m128i*){}.u8));", v(insn.operands[1]));
break; break;
case PPC_INST_VPERM: case PPC_INST_VPERM:
case PPC_INST_VPERM128: case PPC_INST_VPERM128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_perm_epi8_(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_perm_epi8_(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
break; break;
case PPC_INST_VPERMWI128: case PPC_INST_VPERMWI128:
@ -2232,7 +2256,7 @@ bool Recompiler::Recompile(
uint32_t z = 3 - ((insn.operands[2] >> 4) & 0x3); uint32_t z = 3 - ((insn.operands[2] >> 4) & 0x3);
uint32_t w = 3 - ((insn.operands[2] >> 6) & 0x3); uint32_t w = 3 - ((insn.operands[2] >> 6) & 0x3);
uint32_t perm = x | (y << 2) | (z << 4) | (w << 6); uint32_t perm = x | (y << 2) | (z << 4) | (w << 6);
println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm);
break; break;
} }
@ -2274,7 +2298,7 @@ bool Recompiler::Recompile(
case PPC_INST_VPKSHUS: case PPC_INST_VPKSHUS:
case PPC_INST_VPKSHUS128: case PPC_INST_VPKSHUS128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_packus_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break; break;
case PPC_INST_VPKSWUS: case PPC_INST_VPKSWUS:
@ -2296,31 +2320,31 @@ bool Recompiler::Recompile(
case PPC_INST_VREFP128: case PPC_INST_VREFP128:
// TODO: see if we can use rcp safely // TODO: see if we can use rcp safely
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1])); println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]));
break; break;
case PPC_INST_VRFIM: case PPC_INST_VRFIM:
case PPC_INST_VRFIM128: case PPC_INST_VRFIM128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
break; break;
case PPC_INST_VRFIN: case PPC_INST_VRFIN:
case PPC_INST_VRFIN128: case PPC_INST_VRFIN128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
break; break;
case PPC_INST_VRFIZ: case PPC_INST_VRFIZ:
case PPC_INST_VRFIZ128: case PPC_INST_VRFIZ128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1])); println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
break; break;
case PPC_INST_VRLIMI128: case PPC_INST_VRLIMI128:
{ {
constexpr size_t shuffles[] = { _MM_SHUFFLE(3, 2, 1, 0), _MM_SHUFFLE(2, 1, 0, 3), _MM_SHUFFLE(1, 0, 3, 2), _MM_SHUFFLE(0, 3, 2, 1) }; constexpr size_t shuffles[] = { SIMDE_MM_SHUFFLE(3, 2, 1, 0), SIMDE_MM_SHUFFLE(2, 1, 0, 3), SIMDE_MM_SHUFFLE(1, 0, 3, 2), SIMDE_MM_SHUFFLE(0, 3, 2, 1) };
println("\t_mm_store_ps({}.f32, _mm_blend_ps(_mm_load_ps({}.f32), _mm_permute_ps(_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]); println("\tsimde_mm_store_ps({}.f32, simde_mm_blend_ps(simde_mm_load_ps({}.f32), simde_mm_permute_ps(simde_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]);
break; break;
} }
@ -2337,12 +2361,12 @@ bool Recompiler::Recompile(
// TODO: see if we can use rsqrt safely // TODO: see if we can use rsqrt safely
// TODO: we can detect if the input is from a dot product and apply logic only on one value // TODO: we can detect if the input is from a dot product and apply logic only on one value
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_sqrt_ps(_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1])); println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_sqrt_ps(simde_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1]));
break; break;
case PPC_INST_VSEL: case PPC_INST_VSEL:
case PPC_INST_VSEL128: case PPC_INST_VSEL128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_or_si128(simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)), simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2]));
break; break;
case PPC_INST_VSLB: case PPC_INST_VSLB:
@ -2359,7 +2383,7 @@ bool Recompiler::Recompile(
case PPC_INST_VSLDOI: case PPC_INST_VSLDOI:
case PPC_INST_VSLDOI128: case PPC_INST_VSLDOI128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_alignr_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]);
break; break;
case PPC_INST_VSLW: case PPC_INST_VSLW:
@ -2373,7 +2397,7 @@ bool Recompiler::Recompile(
{ {
// NOTE: accounting for full vector reversal here // NOTE: accounting for full vector reversal here
uint32_t perm = 15 - insn.operands[2]; uint32_t perm = 15 - insn.operands[2];
println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm);
break; break;
} }
@ -2382,12 +2406,12 @@ bool Recompiler::Recompile(
// NOTE: accounting for full vector reversal here // NOTE: accounting for full vector reversal here
uint32_t perm = 7 - insn.operands[2]; uint32_t perm = 7 - insn.operands[2];
perm = (perm * 2) | ((perm * 2 + 1) << 8); perm = (perm * 2) | ((perm * 2 + 1) << 8);
println("\t_mm_store_si128((__m128i*){}.u16, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm); println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm);
break; break;
} }
case PPC_INST_VSPLTISB: case PPC_INST_VSPLTISB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
break; break;
case PPC_INST_VSPLTISH: case PPC_INST_VSPLTISH:
@ -2396,7 +2420,7 @@ bool Recompiler::Recompile(
case PPC_INST_VSPLTISW: case PPC_INST_VSPLTISW:
case PPC_INST_VSPLTISW128: case PPC_INST_VSPLTISW128:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
break; break;
case PPC_INST_VSPLTW: case PPC_INST_VSPLTW:
@ -2405,12 +2429,12 @@ bool Recompiler::Recompile(
// NOTE: accounting for full vector reversal here // NOTE: accounting for full vector reversal here
uint32_t perm = 3 - insn.operands[2]; uint32_t perm = 3 - insn.operands[2];
perm |= (perm << 2) | (perm << 4) | (perm << 6); perm |= (perm << 2) | (perm << 4) | (perm << 6);
println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm); println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm);
break; break;
} }
case PPC_INST_VSR: case PPC_INST_VSR:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsr(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_vsr(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VSRAB: case PPC_INST_VSRAB:
@ -2448,7 +2472,7 @@ bool Recompiler::Recompile(
case PPC_INST_VSUBFP: case PPC_INST_VSUBFP:
case PPC_INST_VSUBFP128: case PPC_INST_VSUBFP128:
printSetFlushMode(true); printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_ps({}.f32, simde_mm_sub_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VSUBSHS: case PPC_INST_VSUBSHS:
@ -2470,7 +2494,7 @@ bool Recompiler::Recompile(
break; break;
case PPC_INST_VSUBUBS: case PPC_INST_VSUBUBS:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_subs_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_subs_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break; break;
case PPC_INST_VSUBUBM: case PPC_INST_VSUBUBM:
@ -2515,32 +2539,32 @@ bool Recompiler::Recompile(
case PPC_INST_VUPKHSB: case PPC_INST_VUPKHSB:
case PPC_INST_VUPKHSB128: case PPC_INST_VUPKHSB128:
println("\t_mm_store_si128((__m128i*){}.s16, _mm_cvtepi8_epi16(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_cvtepi8_epi16(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s8), simde_mm_load_si128((simde__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1]));
break; break;
case PPC_INST_VUPKHSH: case PPC_INST_VUPKHSH:
case PPC_INST_VUPKHSH128: case PPC_INST_VUPKHSH128:
println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1]));
break; break;
case PPC_INST_VUPKLSB: case PPC_INST_VUPKLSB:
case PPC_INST_VUPKLSB128: case PPC_INST_VUPKLSB128:
println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi8_epi16(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi8_epi16(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]));
break; break;
case PPC_INST_VUPKLSH: case PPC_INST_VUPKLSH:
case PPC_INST_VUPKLSH128: case PPC_INST_VUPKLSH128:
println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1])); println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]));
break; break;
case PPC_INST_VXOR: case PPC_INST_VXOR:
case PPC_INST_VXOR128: case PPC_INST_VXOR128:
print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0])); print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0]));
if (insn.operands[1] != insn.operands[2]) if (insn.operands[1] != insn.operands[2])
println("_mm_xor_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2])); println("simde_mm_xor_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2]));
else else
println("_mm_setzero_si128());"); println("simde_mm_setzero_si128());");
break; break;
@ -2676,7 +2700,10 @@ bool Recompiler::Recompile(const Function& fn)
name = fmt::format("sub_{}", fn.base); name = fmt::format("sub_{}", fn.base);
} }
#ifdef XENON_RECOMP_USE_ALIAS
println("__attribute__((alias(\"__imp__{}\"))) PPC_WEAK_FUNC({});", name, name); println("__attribute__((alias(\"__imp__{}\"))) PPC_WEAK_FUNC({});", name, name);
#endif
println("PPC_FUNC_IMPL(__imp__{}) {{", name); println("PPC_FUNC_IMPL(__imp__{}) {{", name);
println("\tPPC_FUNC_PROLOGUE();"); println("\tPPC_FUNC_PROLOGUE();");
@ -2737,6 +2764,12 @@ bool Recompiler::Recompile(const Function& fn)
println("}}\n"); println("}}\n");
#ifndef XENON_RECOMP_USE_ALIAS
println("PPC_WEAK_FUNC({}) {{", name);
println("\t__imp__{}(ctx, base);", name);
println("}}\n");
#endif
std::swap(out, tempString); std::swap(out, tempString);
if (localVariables.ctr) if (localVariables.ctr)
println("\tPPCRegister ctr{{}};"); println("\tPPCRegister ctr{{}};");

View file

@ -38,6 +38,15 @@ void RecompilerConfig::Load(const std::string_view& configFilePath)
longJmpAddress = main["longjmp_address"].value_or(0u); longJmpAddress = main["longjmp_address"].value_or(0u);
setJmpAddress = main["setjmp_address"].value_or(0u); setJmpAddress = main["setjmp_address"].value_or(0u);
if (restGpr14Address == 0) fmt::println("ERROR: __restgprlr_14 address is unspecified");
if (saveGpr14Address == 0) fmt::println("ERROR: __savegprlr_14 address is unspecified");
if (restFpr14Address == 0) fmt::println("ERROR: __restfpr_14 address is unspecified");
if (saveFpr14Address == 0) fmt::println("ERROR: __savefpr_14 address is unspecified");
if (restVmx14Address == 0) fmt::println("ERROR: __restvmx_14 address is unspecified");
if (saveVmx14Address == 0) fmt::println("ERROR: __savevmx_14 address is unspecified");
if (restVmx64Address == 0) fmt::println("ERROR: __restvmx_64 address is unspecified");
if (saveVmx64Address == 0) fmt::println("ERROR: __savevmx_64 address is unspecified");
if (auto functionsArray = main["functions"].as_array()) if (auto functionsArray = main["functions"].as_array())
{ {
for (auto& func : *functionsArray) for (auto& func : *functionsArray)

View file

@ -17,8 +17,9 @@ target_compile_definitions(XenonUtils
) )
target_include_directories(XenonUtils target_include_directories(XenonUtils
PUBLIC PUBLIC
. .
"${THIRDPARTY_ROOT}/simde"
PRIVATE PRIVATE
"${THIRDPARTY_ROOT}/libmspack/libmspack/mspack" "${THIRDPARTY_ROOT}/libmspack/libmspack/mspack"
"${THIRDPARTY_ROOT}/tiny-AES-c" "${THIRDPARTY_ROOT}/tiny-AES-c"

View file

@ -2,7 +2,6 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <set> #include <set>
#include <expected>
#include <section.h> #include <section.h>
#include "symbol_table.h" #include "symbol_table.h"

View file

@ -12,13 +12,13 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <x86intrin.h> #include <x86/avx.h>
#include <x86/sse.h>
#include <x86/sse4.1.h>
#ifdef _WIN32 // SSE3 constants are missing from simde
#include <intrin.h> #ifndef _MM_DENORMALS_ZERO_MASK
#else #define _MM_DENORMALS_ZERO_MASK 0x0040
#include <xmmintrin.h>
#include <smmintrin.h>
#endif #endif
#define PPC_JOIN(x, y) x##y #define PPC_JOIN(x, y) x##y
@ -175,18 +175,18 @@ struct PPCCRRegister
eq = !un && (left == right); eq = !un && (left == right);
} }
inline void setFromMask(__m128 mask, int imm) noexcept inline void setFromMask(simde__m128 mask, int imm) noexcept
{ {
int m = _mm_movemask_ps(mask); int m = simde_mm_movemask_ps(mask);
lt = m == imm; // all equal lt = m == imm; // all equal
gt = 0; gt = 0;
eq = m == 0; // none equal eq = m == 0; // none equal
so = 0; so = 0;
} }
inline void setFromMask(__m128i mask, int imm) noexcept inline void setFromMask(simde__m128i mask, int imm) noexcept
{ {
int m = _mm_movemask_epi8(mask); int m = simde_mm_movemask_epi8(mask);
lt = m == imm; // all equal lt = m == imm; // all equal
gt = 0; gt = 0;
eq = m == 0; // none equal eq = m == 0; // none equal
@ -221,34 +221,71 @@ struct PPCFPSCRRegister
{ {
uint32_t csr; uint32_t csr;
static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN };
static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO }; static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO };
// simde does not handle denormal flags, so we need to implement per-arch.
#if defined(__x86_64__) || defined(_M_X64)
static constexpr size_t RoundShift = 13;
static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK;
static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN };
inline uint32_t getcsr() noexcept
{
return simde_mm_getcsr();
}
inline void setcsr(uint32_t csr) noexcept
{
simde_mm_setcsr(csr);
}
#elif defined(__aarch64__) || defined(_M_ARM64)
// RMode
static constexpr size_t RoundShift = 22;
static constexpr size_t RoundMask = 3 << RoundShift;
// FZ and FZ16
static constexpr size_t FlushMask = (1 << 19) | (1 << 24);
// Nearest, Zero, -Infinity, -Infinity
static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift };
inline uint32_t getcsr() noexcept
{
uint64_t csr;
__asm__ __volatile__("mrs %0, fpcr" : "=r"(csr));
return csr;
}
inline void setcsr(uint32_t csr) noexcept
{
__asm__ __volatile__("msr fpcr, %0" : : "r"(csr));
}
#else
# error "Missing implementation for FPSCR."
#endif
inline uint32_t loadFromHost() noexcept inline uint32_t loadFromHost() noexcept
{ {
csr = _mm_getcsr(); csr = getcsr();
return HostToGuest[(csr & _MM_ROUND_MASK) >> 13]; return HostToGuest[(csr & RoundMask) >> RoundShift];
} }
inline void storeFromGuest(uint32_t value) noexcept inline void storeFromGuest(uint32_t value) noexcept
{ {
csr &= ~_MM_ROUND_MASK; csr &= ~RoundMask;
csr |= GuestToHost[value & PPC_ROUND_MASK]; csr |= GuestToHost[value & PPC_ROUND_MASK];
_mm_setcsr(csr); setcsr(csr);
} }
static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
inline void enableFlushModeUnconditional() noexcept inline void enableFlushModeUnconditional() noexcept
{ {
csr |= FlushMask; csr |= FlushMask;
_mm_setcsr(csr); setcsr(csr);
} }
inline void disableFlushModeUnconditional() noexcept inline void disableFlushModeUnconditional() noexcept
{ {
csr &= ~FlushMask; csr &= ~FlushMask;
_mm_setcsr(csr); setcsr(csr);
} }
inline void enableFlushMode() noexcept inline void enableFlushMode() noexcept
@ -256,7 +293,7 @@ struct PPCFPSCRRegister
if ((csr & FlushMask) != FlushMask) [[unlikely]] if ((csr & FlushMask) != FlushMask) [[unlikely]]
{ {
csr |= FlushMask; csr |= FlushMask;
_mm_setcsr(csr); setcsr(csr);
} }
} }
@ -265,7 +302,7 @@ struct PPCFPSCRRegister
if ((csr & FlushMask) != 0) [[unlikely]] if ((csr & FlushMask) != 0) [[unlikely]]
{ {
csr &= ~FlushMask; csr &= ~FlushMask;
_mm_setcsr(csr); setcsr(csr);
} }
} }
}; };
@ -593,81 +630,94 @@ inline uint8_t VectorShiftTableR[] =
0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
}; };
inline __m128i _mm_adds_epu32(__m128i a, __m128i b) inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b)
{ {
return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b)); return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b));
} }
inline __m128i _mm_avg_epi8(__m128i a, __m128i b) inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b)
{ {
__m128i c = _mm_set1_epi8(char(128)); simde__m128i c = simde_mm_set1_epi8(char(128));
return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
} }
inline __m128i _mm_avg_epi16(__m128i a, __m128i b) inline simde__m128i simde_mm_avg_epi16(simde__m128i a, simde__m128i b)
{ {
__m128i c = _mm_set1_epi16(short(32768)); simde__m128i c = simde_mm_set1_epi16(short(32768));
return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
} }
inline __m128 _mm_cvtepu32_ps_(__m128i src1) inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1)
{ {
__m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127)); simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127));
__m128i xmm0 = _mm_slli_epi32(src1, 31 - 8); simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8);
xmm0 = _mm_srli_epi32(xmm0, 31); xmm0 = simde_mm_srli_epi32(xmm0, 31);
xmm0 = _mm_add_epi32(xmm0, xmm1); xmm0 = simde_mm_add_epi32(xmm0, xmm1);
xmm0 = _mm_srai_epi32(xmm0, 8); xmm0 = simde_mm_srai_epi32(xmm0, 8);
xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000)); xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000));
__m128 xmm2 = _mm_cvtepi32_ps(src1); simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1);
return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1)); return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1));
} }
inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c) inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c)
{ {
__m128i d = _mm_set1_epi8(0xF); simde__m128i d = simde_mm_set1_epi8(0xF);
__m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d)); simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d));
return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3)); return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3));
} }
inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b) inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b)
{ {
__m128i c = _mm_set1_epi8(char(128)); simde__m128i c = simde_mm_set1_epi8(char(128));
return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
} }
inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b) inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b)
{ {
__m128i c = _mm_set1_epi16(short(32768)); simde__m128i c = simde_mm_set1_epi16(short(32768));
return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
} }
inline __m128i _mm_vctsxs(__m128 src1) inline simde__m128i simde_mm_vctsxs(simde__m128 src1)
{ {
__m128 xmm2 = _mm_cmpunord_ps(src1, src1); simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1);
__m128i xmm0 = _mm_cvttps_epi32(src1); simde__m128i xmm0 = simde_mm_cvttps_epi32(src1);
__m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN)); simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN));
xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1); xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1);
__m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1)); simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1));
return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest)); return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest));
} }
inline __m128i _mm_vctuxs(__m128 src1) inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b)
{ {
__m128 xmm0 = _mm_max_ps(src1, _mm_set1_epi32(0)); b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61);
__m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000)); return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10));
__m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000));
xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1);
__m128i dest = _mm_cvttps_epi32(xmm0);
xmm0 = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN));
xmm1 = _mm_and_si128(xmm1, _mm_set1_epi32(INT_MIN));
dest = _mm_add_epi32(dest, xmm1);
return _mm_or_si128(dest, xmm0);
} }
inline __m128i _mm_vsr(__m128i a, __m128i b) inline simde__m128i simde_mm_vctuxs(simde__m128 src1)
{ {
b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61); simde__m128 xmm0 = simde_mm_max_ps(src1, simde_mm_set1_epi32(0));
return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10)); simde__m128 xmm1 = simde_mm_cmpge_ps(xmm0, simde_mm_set1_ps((float)0x80000000));
simde__m128 xmm2 = simde_mm_sub_ps(xmm0, simde_mm_set1_ps((float)0x80000000));
xmm0 = simde_mm_blendv_ps(xmm0, xmm2, xmm1);
simde__m128i dest = simde_mm_cvttps_epi32(xmm0);
xmm0 = simde_mm_cmpeq_epi32(dest, simde_mm_set1_epi32(INT_MIN));
xmm1 = simde_mm_and_si128(xmm1, simde_mm_set1_epi32(INT_MIN));
dest = simde_mm_add_epi32(dest, xmm1);
return simde_mm_or_si128(dest, xmm0);
} }
#if defined(__aarch64__) || defined(_M_ARM64)
// AArch64 replacement for the x86 __rdtsc() intrinsic: returns the value of the
// cntvct_el0 counter register.
// NOTE(review): this counter presumably ticks at the system counter frequency
// rather than in CPU cycles; confirm callers only need a monotonic tick.
inline uint64_t __rdtsc()
{
    uint64_t ret;
    __asm__ __volatile__("mrs %0, cntvct_el0\n\t"
                         : "=r"(ret)::"memory");
    return ret;
}
#elif !defined(__x86_64__) && !defined(_M_X64)
# error "Missing implementation for __rdtsc()"
#endif
#endif #endif

1
thirdparty/simde vendored Submodule

@ -0,0 +1 @@
Subproject commit a532a12ca9bbdc5e6547eb602e6256b71a5377d4