Compare commits

...

11 commits

Author SHA1 Message Date
Jillian To
f2a92cf8fc
Merge 8fc280bed9 into c5bfd90d87 2025-08-04 07:17:28 +03:00
Isaac Marovitz
c5bfd90d87
Add simde (#22)
* Inital add SIMD-E

Signed-off-by: Isaac Marovitz <isaacryu@icloud.com>

* Include simde in ppc_context.h

Signed-off-by: Isaac Marovitz <isaacryu@icloud.com>

* Fix missing defines, don’t use native aliases

Signed-off-by: Isaac Marovitz <isaacryu@icloud.com>

* Update recompiler

Signed-off-by: Isaac Marovitz <isaacryu@icloud.com>

* Correct fround constants

Signed-off-by: Isaac Marovitz <isaacryu@icloud.com>

* Implement __rdtsc and FPCSR for ARM64.

* Remove static

Signed-off-by: Isaac Marovitz <isaacryu@icloud.com>

---------

Signed-off-by: Isaac Marovitz <isaacryu@icloud.com>
Co-authored-by: squidbus <175574877+squidbus@users.noreply.github.com>
2025-08-03 17:29:25 +03:00
JillianTo
8fc280bed9 use raw strings to avoid escape character syntax errors in python 3.12 2025-03-11 15:54:36 -04:00
JillianTo
4452868029 When checking for subroutine, don't make .text have to be at the beginning of the line, this accounts for some weird HTML formatting 2025-03-09 20:53:46 -04:00
JillianTo
9a4dc311c7 fix name in script 2025-03-09 19:12:39 -04:00
JillianTo
444ee2bda1 don't run duplicate check during debug because we allow duplicates during that, rename file 2025-03-09 19:10:49 -04:00
JillianTo
2365f4d697 missing 3 functions, but otherwise correct 2025-03-09 17:56:37 -04:00
JillianTo
21f1a81aa3 missing 3 functions, 0x82F08730 starts at wrong address 2025-03-09 12:53:48 -04:00
JillianTo
6dbbc6ea14 only gets address/size of 0x82893088 and size of 0x82CF7080 wrong 2025-03-08 21:22:26 -05:00
JillianTo
fe3fdbdda5 fixed typo 2025-03-08 13:00:48 -05:00
JillianTo
b18a1a6206 mostly correct 2025-03-08 12:47:45 -05:00
9 changed files with 507 additions and 150 deletions

3
.gitmodules vendored
View file

@ -13,3 +13,6 @@
[submodule "thirdparty/tiny-AES-c"]
path = thirdparty/tiny-AES-c
url = https://github.com/kokke/tiny-AES-c.git
[submodule "thirdparty/simde"]
path = thirdparty/simde
url = https://github.com/simd-everywhere/simde-no-tests.git

297
Auto_Function_Parser.py Normal file
View file

@ -0,0 +1,297 @@
##
## Searches for functions in .text that are referenced by functions in .pdata
##
## Input:
## Decompiled code - Created in IDA Pro 9.0SP1 with File -> Produce File -> Create HTML File...
## CLI output from a XenonRecomp run - When trying to compile with XenonRecomp, use > to save the output from the terminal
##
## Output:
## XenonRecomp config - Function block for TOML to be inputted into XenonRecomp
##
import sys
import re
# Check if correct number of input arguments were given
if len(sys.argv) != 4:
sys.exit("Auto_Function_Parser.py [IDA HTML] [XenonRecomp log] [Output TOML]")
# Filepath input arguments
ida_html = sys.argv[1]
xenonrecomp_log = sys.argv[2]
output_file = sys.argv[3]
# Disable extra debug output
debug = False
##
## Parse XenonRecomp log
##
# The starting index of the erroneous switch statement address in the XenonRecomp log
switch_idx = 22
# Initialize list to store erroneous switch statement addresses
switch_addrs = []
print("Parsing XenonRecomp log...")
# Import each line of XenonRecomp log
with open(xenonrecomp_log, 'r') as file:
# Read each line in the file
for line in file:
# If this line describes an error, it has the address of a problematic switch statement
if re.search('ERROR: Switch case at ', line):
# Save the address as integer
switch_addrs.append(line[switch_idx:switch_idx+8])
# Save only unique addresses and sort
switch_addrs = set(switch_addrs)
##
## Parse IDA HTML
##
# Initialize list to store start and end of functions
functs = []
# Count how many functions have been added
num_functs = 0
# Function for adding to function list and incrementing count
def add_function(new_start_addr, prev_end_addr, start_type):
global num_functs
# If an end address for the last added function was specified
if prev_end_addr != None:
# Set end address for last added function
functs[num_functs-1][1] = prev_end_addr
# Add a new function to the list with the specified starting address
functs.append([new_start_addr, 0, [], start_type])
# Increment the number of functions
num_functs = num_functs+1
# Mark if we are in .text section
in_text = False
# Mark if we should end parsing
end_parse = False
# Initialize address of last bctr instruction to 0
bctr_addr = '00000000'
# Initialize address of last blr instruction to 0
blr_addr = '00000000'
# Initialize address of last 'End of function' comment to 0
eof_addr = '00000000'
# Initialize address of last restgprlr instruction to 0
restgprlr_addr = '00000000'
# Initialize address of last padding to 0
pad_addr = 0
# Import each line of decompiled code
print("Parsing IDA HTML...")
with open(ida_html, 'r') as file:
# Read each line in the file
for line in file:
if not end_parse:
# If in .text
if in_text:
# Get the current address
colon_idx = line.find(':')
curr_addr = line[colon_idx+1:colon_idx+9]
# Check if this is the start of a function
if re.search(r'\.text:'+curr_addr+' </span><span class="c[0-9]*">sub_'+curr_addr+'</span><span class="c[0-9]*">: *</span><span class="c[0-9]*"># [A-Z][A-Z][A-Z][A-Z] XREF:', line):
# Save current address as integer
curr_addr_int = int(curr_addr, 16)
# If this is not the first function being added
if num_functs > 0:
# If last address had padding or restgprlr instruction, then this function was already added
if curr_addr_int-4 == pad_addr or curr_addr_int-4 == restgprlr_addr:
# Set function type for start address
functs[num_functs-1][3] = 'sub'
else:
# Check if this function is part of latest added function
is_nested_funct = False
nested_functs = functs[num_functs-1][2]
for nested_funct in nested_functs:
if nested_funct == curr_addr:
is_nested_funct = True
break
# If last address was not padding and not nested in latest function
if not is_nested_funct:
# Add new function and last function's end address
add_function(curr_addr_int, curr_addr_int, 'sub')
else:
# Add new function
add_function(curr_addr_int, None, 'sub')
# If this is a location
elif re.search(r'^\.text:'+curr_addr+' </span><span class="c[0-9]*">loc_'+curr_addr, line):
curr_addr_int = int(curr_addr, 16)
curr_funct = functs[num_functs-1]
# If previous address was a blr instruction
if curr_addr_int-4 == int(blr_addr, 16):
# If previous address had an 'End of function' comment or if there was a bctr with the comment
if blr_addr == eof_addr or bctr_addr == eof_addr:
# Find a XREF pointing to a .text address
xref_idx = line.find('XREF: .text:')
if xref_idx > -1:
underscore_idx = line.find('_', xref_idx)
if underscore_idx > -1:
xref = line[underscore_idx+1:underscore_idx+9]
else:
xref = line[xref_idx+12:xref_idx+20]
else:
xref = None
# Couldn't find XREF pointing to .text address or the XREF is after this address
if xref == None or int(xref, 16) > curr_addr_int:
# Add as new function
add_function(curr_addr_int, curr_addr_int, 'loc')
else:
# Find address of function that references this
xref_idx = line.find('CODE XREF: sub_')
# If it was found
if xref_idx > -1:
# Store as nested function in latest function
functs[num_functs-1][2].append(line[xref_idx+15:xref_idx+23])
# Check if this line is padding
elif num_functs > 0 and re.search(r'<span class="c[0-9]*">\.long </span><span class="c[0-9]*">0$', line):
# Convert current address to integer
curr_addr_int = int(curr_addr, 16)
# Add a new function at the line after padding, and end the current function at this padding address
add_function(curr_addr_int+4, curr_addr_int, None)
# Save padding address
pad_addr = curr_addr_int
# Check for blr instruction
elif re.search('<span class="c[0-9]*">blr$', line):
blr_addr = curr_addr
# Check for 'End of function' comment
elif re.search('End of function ', line):
eof_addr = curr_addr
# Check for bctr instruction
elif re.search('<span class="c[0-9]*">bctr$', line):
bctr_addr = curr_addr
# Check for restgprlr instruction
elif re.search('<span class="c[0-9]*">b </span><span class="c[0-9]*">__restgprlr_[0-9][0-9]$', line):
# Convert current address to integer
curr_addr_int = int(curr_addr, 16)
# Add a new function at the line after restgprlr instruction, and end the current function at this address
add_function(curr_addr_int+4, curr_addr_int, None)
restgprlr_addr = curr_addr_int
# If not in .text
else:
# If .text section header found
if re.search(r'<span class="c[0-9]*">\.section &quot;\.text&quot;', line):
in_text = True
##
## Find .text functions that are referenced by .pdata functions
##
# Initialize list for functions that need to be added to toml
output_functs = []
# Look for related functions for every unique errored switch statement
print("Searching for needed functions...")
for switch_addr in switch_addrs:
# Start looking at first subroutine
curr_funct_idx = 0
# Save current switch statement address as integer
switch_addr_int = int(switch_addr, 16)
# The related function for this switch statement has not been found yet
search_for_funct = True
# Start search for function relating to switch statement
while(search_for_funct):
curr_funct = functs[curr_funct_idx]
# If switch address is after this function's start
curr_funct_start = curr_funct[0]
if(switch_addr_int > curr_funct_start):
# If switch address is before this function's end
curr_funct_end = curr_funct[1]
if(switch_addr_int <= curr_funct_end):
# Save current function's start address and the function's length
if debug:
output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start), switch_addr])
else:
output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)])
# Don't need to continue search for this switch statement
search_for_funct = False
# Look in next function
curr_funct_idx = curr_funct_idx + 1
# Related function was not found
else:
print(f"WARNING: Function relating to {switch_addr} not found! Skipping.")
# Don't need to continue search for this switch statement
search_for_funct = False
# Remove duplicates
if not debug:
output_functs = list(set(tuple(funct) for funct in output_functs))
# Make sure there are no functions with the same starting address but different lengths
if not debug:
for i in range(len(output_functs)):
for j in range(i+1, len(output_functs)):
curr_funct_start = output_functs[i][0]
if curr_funct_start == output_functs[j][0]:
print(f"WARNING: {curr_funct_start} has multiple entries of different lengths, manually find correct one.")
print(f"{len(output_functs)} functions found!")
##
## Output all found functions to TOML in correct format
##
# Create formatted string to export to TOML
output_str = "functions = ["
# Append all function addresses and lengths to formatted string
print("Outputting to formatted file...")
for funct in output_functs:
# Format hex to uppercase
curr_funct_start = '0x'+funct[0][2:].upper()
curr_funct_end = '0x'+funct[1][2:].upper()
# Format function
curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end
if debug:
curr_funct = curr_funct+", src = "+funct[2]
curr_funct = curr_funct+" },"
# Add to complete output string
output_str = output_str+curr_funct
# Delete last comma
output_str = output_str[:len(output_str)-1]
# Add last bracket
output_str = output_str+"\n]"
# Output to file
with open(output_file, "w") as file:
file.write(output_str)

View file

@ -190,6 +190,12 @@ functions = [
You can define function boundaries explicitly using the `functions` property if XenonAnalyse fails to analyze them correctly, for example, with functions containing jump tables.
You can automatically generate these using the FunctionParser.py script. You will need to create a HTML of your decompiled XEX with IDA using `File -> Produce File -> Create HTML File...` and save the terminal output from running XenonRecomp by appending `> [output log file path]` to the command.
```
python3 FunctionParser.py [input IDA HTML file path] [input XenonRecomp log file path] [output function list file path]
```
#### Invalid Instruction Skips
```toml

View file

@ -16,4 +16,4 @@
#include <xbox.h>
#include <xxhash.h>
#include <fmt/core.h>
#include <xmmintrin.h>
#include <x86/sse.h>

View file

@ -897,17 +897,17 @@ bool Recompiler::Recompile(
case PPC_INST_FCTID:
printSetFlushMode(false);
println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvtsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : simde_mm_cvtsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
break;
case PPC_INST_FCTIDZ:
printSetFlushMode(false);
println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : _mm_cvttsd_si64(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
println("\t{}.s64 = ({}.f64 > double(LLONG_MAX)) ? LLONG_MAX : simde_mm_cvttsd_si64(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
break;
case PPC_INST_FCTIWZ:
printSetFlushMode(false);
println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : _mm_cvttsd_si32(_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
println("\t{}.s64 = ({}.f64 > double(INT_MAX)) ? INT_MAX : simde_mm_cvttsd_si32(simde_mm_load_sd(&{}.f64));", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[1]));
break;
case PPC_INST_FDIV:
@ -1139,10 +1139,10 @@ bool Recompiler::Recompile(
case PPC_INST_LVX128:
// NOTE: for endian swapping, we reverse the whole vector instead of individual elements.
// this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz)
print("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ((", v(insn.operands[0]));
print("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ((", v(insn.operands[0]));
if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32) & ~0xF))), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2]));
println("{}.u32) & ~0xF))), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2]));
break;
case PPC_INST_LVLX:
@ -1151,7 +1151,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp());
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskL[({}.u32 & 0xF) * 16])));", v(insn.operands[0]), temp(), temp());
break;
case PPC_INST_LVRX:
@ -1160,7 +1160,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, {}.u32 & 0xF ? _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ({}.u32 & ~0xF))), _mm_load_si128((__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : _mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp());
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, {}.u32 & 0xF ? simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*)(base + ({}.u32 & ~0xF))), simde_mm_load_si128((simde__m128i*)&VectorMaskR[({}.u32 & 0xF) * 16])) : simde_mm_setzero_si128());", v(insn.operands[0]), temp(), temp(), temp());
break;
case PPC_INST_LVSL:
@ -1168,7 +1168,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp());
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableL[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp());
break;
case PPC_INST_LVSR:
@ -1176,7 +1176,7 @@ bool Recompiler::Recompile(
if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32;", r(insn.operands[2]));
println("\t_mm_store_si128((__m128i*){}.u8, _mm_load_si128((__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp());
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_load_si128((simde__m128i*)&VectorShiftTableR[({}.u32 & 0xF) * 16]));", v(insn.operands[0]), temp());
break;
case PPC_INST_LWA:
@ -1635,10 +1635,10 @@ bool Recompiler::Recompile(
case PPC_INST_STVX:
case PPC_INST_STVX128:
print("\t_mm_store_si128((__m128i*)(base + ((");
print("\tsimde_mm_store_si128((simde__m128i*)(base + ((");
if (insn.operands[1] != 0)
print("{}.u32 + ", r(insn.operands[1]));
println("{}.u32) & ~0xF)), _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0]));
println("{}.u32) & ~0xF)), simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*)VectorMaskL)));", r(insn.operands[2]), v(insn.operands[0]));
break;
case PPC_INST_STW:
@ -1737,77 +1737,77 @@ bool Recompiler::Recompile(
case PPC_INST_VADDFP:
case PPC_INST_VADDFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VADDSHS:
println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_adds_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VADDUBM:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_add_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VADDUBS:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_adds_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_adds_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VADDUHM:
println("\t_mm_store_si128((__m128i*){}.u16, _mm_add_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_add_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VADDUWM:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_add_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_add_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VADDUWS:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_adds_epu32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_adds_epu32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VAND:
case PPC_INST_VAND128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VANDC128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VAVGSB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VAVGSH:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epi16(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VAVGUB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_avg_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VCTSXS:
case PPC_INST_VCFPSXWS128:
printSetFlushMode(true);
print("\t_mm_store_si128((__m128i*){}.s32, _mm_vctsxs(", v(insn.operands[0]));
print("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_vctsxs(", v(insn.operands[0]));
if (insn.operands[2] != 0)
println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]);
println("simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]);
else
println("_mm_load_ps({}.f32)));", v(insn.operands[1]));
println("simde_mm_load_ps({}.f32)));", v(insn.operands[1]));
break;
case PPC_INST_VCFSX:
case PPC_INST_VCSXWFP128:
{
printSetFlushMode(true);
print("\t_mm_store_ps({}.f32, ", v(insn.operands[0]));
print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0]));
if (insn.operands[2] != 0)
{
const float value = ldexp(1.0f, -int32_t(insn.operands[2]));
println("_mm_mul_ps(_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value));
println("simde_mm_mul_ps(simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value));
}
else
{
println("_mm_cvtepi32_ps(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1]));
println("simde_mm_cvtepi32_ps(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1]));
}
break;
}
@ -1816,15 +1816,15 @@ bool Recompiler::Recompile(
case PPC_INST_VCUXWFP128:
{
printSetFlushMode(true);
print("\t_mm_store_ps({}.f32, ", v(insn.operands[0]));
print("\tsimde_mm_store_ps({}.f32, ", v(insn.operands[0]));
if (insn.operands[2] != 0)
{
const float value = ldexp(1.0f, -int32_t(insn.operands[2]));
println("_mm_mul_ps(_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value));
println("simde_mm_mul_ps(simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x{:X})))));", v(insn.operands[1]), *reinterpret_cast<const uint32_t*>(&value));
}
else
{
println("_mm_cvtepu32_ps_(_mm_load_si128((__m128i*){}.u32)));", v(insn.operands[1]));
println("simde_mm_cvtepu32_ps_(simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[1]));
}
break;
}
@ -1837,46 +1837,46 @@ bool Recompiler::Recompile(
case PPC_INST_VCMPEQFP:
case PPC_INST_VCMPEQFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_cmpeq_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpeq_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break;
case PPC_INST_VCMPEQUB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0]));
println("\t{}.setFromMask(simde_mm_load_si128((simde__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0]));
break;
case PPC_INST_VCMPEQUW:
case PPC_INST_VCMPEQUW128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpeq_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break;
case PPC_INST_VCMPGEFP:
case PPC_INST_VCMPGEFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_cmpge_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpge_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break;
case PPC_INST_VCMPGTFP:
case PPC_INST_VCMPGTFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_cmpgt_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_cmpgt_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
if (strchr(insn.opcode->name, '.'))
println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
println("\t{}.setFromMask(simde_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
break;
case PPC_INST_VCMPGTUB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VCMPGTUH:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_cmpgt_epu16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VEXPTEFP:
@ -1899,87 +1899,87 @@ bool Recompiler::Recompile(
case PPC_INST_VMADDFP:
case PPC_INST_VMADDFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_add_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
break;
case PPC_INST_VMAXFP:
case PPC_INST_VMAXFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_max_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VMAXSW:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_max_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VMINFP:
case PPC_INST_VMINFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_min_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_min_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VMRGHB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpackhi_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpackhi_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VMRGHH:
println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpackhi_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpackhi_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VMRGHW:
case PPC_INST_VMRGHW128:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpackhi_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpackhi_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VMRGLB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpacklo_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_unpacklo_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VMRGLH:
println("\t_mm_store_si128((__m128i*){}.u16, _mm_unpacklo_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_unpacklo_epi16(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_load_si128((simde__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VMRGLW:
case PPC_INST_VMRGLW128:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_unpacklo_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_unpacklo_epi32(simde_mm_load_si128((simde__m128i*){}.u32), simde_mm_load_si128((simde__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VMSUM3FP128:
// NOTE: accounting for full vector reversal here. should dot product yzw instead of xyz
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xEF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VMSUM4FP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_dp_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_dp_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32), 0xFF));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VMULFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VNMSUBFP:
case PPC_INST_VNMSUBFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_xor_ps(simde_mm_sub_ps(simde_mm_mul_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)), simde_mm_load_ps({}.f32)), simde_mm_castsi128_ps(simde_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
break;
case PPC_INST_VOR:
case PPC_INST_VOR128:
print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0]));
print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0]));
if (insn.operands[1] != insn.operands[2])
println("_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2]));
println("simde_mm_or_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2]));
else
println("_mm_load_si128((__m128i*){}.u8));", v(insn.operands[1]));
println("simde_mm_load_si128((simde__m128i*){}.u8));", v(insn.operands[1]));
break;
case PPC_INST_VPERM:
case PPC_INST_VPERM128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_perm_epi8_(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_perm_epi8_(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
break;
case PPC_INST_VPERMWI128:
@ -1990,7 +1990,7 @@ bool Recompiler::Recompile(
uint32_t z = 3 - ((insn.operands[2] >> 4) & 0x3);
uint32_t w = 3 - ((insn.operands[2] >> 6) & 0x3);
uint32_t perm = x | (y << 2) | (z << 4) | (w << 6);
println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm);
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm);
break;
}
@ -2044,38 +2044,38 @@ bool Recompiler::Recompile(
case PPC_INST_VPKSHUS:
case PPC_INST_VPKSHUS128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_packus_epi16(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
break;
case PPC_INST_VREFP:
case PPC_INST_VREFP128:
// TODO: see if we can use rcp safely
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]));
break;
case PPC_INST_VRFIM:
case PPC_INST_VRFIM128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
break;
case PPC_INST_VRFIN:
case PPC_INST_VRFIN128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
break;
case PPC_INST_VRFIZ:
case PPC_INST_VRFIZ128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_round_ps(_mm_load_ps({}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_round_ps(simde_mm_load_ps({}.f32), SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_NO_EXC));", v(insn.operands[0]), v(insn.operands[1]));
break;
case PPC_INST_VRLIMI128:
{
constexpr size_t shuffles[] = { _MM_SHUFFLE(3, 2, 1, 0), _MM_SHUFFLE(2, 1, 0, 3), _MM_SHUFFLE(1, 0, 3, 2), _MM_SHUFFLE(0, 3, 2, 1) };
println("\t_mm_store_ps({}.f32, _mm_blend_ps(_mm_load_ps({}.f32), _mm_permute_ps(_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]);
constexpr size_t shuffles[] = { SIMDE_MM_SHUFFLE(3, 2, 1, 0), SIMDE_MM_SHUFFLE(2, 1, 0, 3), SIMDE_MM_SHUFFLE(1, 0, 3, 2), SIMDE_MM_SHUFFLE(0, 3, 2, 1) };
println("\tsimde_mm_store_ps({}.f32, simde_mm_blend_ps(simde_mm_load_ps({}.f32), simde_mm_permute_ps(simde_mm_load_ps({}.f32), {}), {}));", v(insn.operands[0]), v(insn.operands[0]), v(insn.operands[1]), shuffles[insn.operands[3]], insn.operands[2]);
break;
}
@ -2084,11 +2084,11 @@ bool Recompiler::Recompile(
// TODO: see if we can use rsqrt safely
// TODO: we can detect if the input is from a dot product and apply logic only on one value
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_div_ps(_mm_set1_ps(1), _mm_sqrt_ps(_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_div_ps(simde_mm_set1_ps(1), simde_mm_sqrt_ps(simde_mm_load_ps({}.f32))));", v(insn.operands[0]), v(insn.operands[1]));
break;
case PPC_INST_VSEL:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_or_si128(simde_mm_andnot_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)), simde_mm_and_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2]));
break;
case PPC_INST_VSLB:
@ -2099,7 +2099,7 @@ bool Recompiler::Recompile(
case PPC_INST_VSLDOI:
case PPC_INST_VSLDOI128:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]);
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_alignr_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]);
break;
case PPC_INST_VSLW:
@ -2113,7 +2113,7 @@ bool Recompiler::Recompile(
{
// NOTE: accounting for full vector reversal here
uint32_t perm = 15 - insn.operands[2];
println("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u8), _mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm);
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_set1_epi8(char(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm);
break;
}
@ -2122,17 +2122,17 @@ bool Recompiler::Recompile(
// NOTE: accounting for full vector reversal here
uint32_t perm = 7 - insn.operands[2];
perm = (perm * 2) | ((perm * 2 + 1) << 8);
println("\t_mm_store_si128((__m128i*){}.u16, _mm_shuffle_epi8(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm);
println("\tsimde_mm_store_si128((simde__m128i*){}.u16, simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i*){}.u16), simde_mm_set1_epi16(short(0x{:X}))));", v(insn.operands[0]), v(insn.operands[1]), perm);
break;
}
case PPC_INST_VSPLTISB:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
break;
case PPC_INST_VSPLTISW:
case PPC_INST_VSPLTISW128:
println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
break;
case PPC_INST_VSPLTW:
@ -2141,12 +2141,12 @@ bool Recompiler::Recompile(
// NOTE: accounting for full vector reversal here
uint32_t perm = 3 - insn.operands[2];
perm |= (perm << 2) | (perm << 4) | (perm << 6);
println("\t_mm_store_si128((__m128i*){}.u32, _mm_shuffle_epi32(_mm_load_si128((__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm);
println("\tsimde_mm_store_si128((simde__m128i*){}.u32, simde_mm_shuffle_epi32(simde_mm_load_si128((simde__m128i*){}.u32), 0x{:X}));", v(insn.operands[0]), v(insn.operands[1]), perm);
break;
}
case PPC_INST_VSR:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsr(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_vsr(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VSRAW:
@ -2166,7 +2166,7 @@ bool Recompiler::Recompile(
case PPC_INST_VSUBFP:
case PPC_INST_VSUBFP128:
printSetFlushMode(true);
println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_ps({}.f32, simde_mm_sub_ps(simde_mm_load_ps({}.f32), simde_mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VSUBSWS:
@ -2179,11 +2179,11 @@ bool Recompiler::Recompile(
break;
case PPC_INST_VSUBUBS:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_subs_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_subs_epu8(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VSUBUHM:
println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
println("\tsimde_mm_store_si128((simde__m128i*){}.u8, simde_mm_sub_epi16(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
break;
case PPC_INST_VUPKD3D128:
@ -2220,32 +2220,32 @@ bool Recompiler::Recompile(
case PPC_INST_VUPKHSB:
case PPC_INST_VUPKHSB128:
println("\t_mm_store_si128((__m128i*){}.s16, _mm_cvtepi8_epi16(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.s16, simde_mm_cvtepi8_epi16(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s8), simde_mm_load_si128((simde__m128i*){}.s8))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1]));
break;
case PPC_INST_VUPKHSH:
case PPC_INST_VUPKHSH128:
println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_unpackhi_epi64(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_unpackhi_epi64(simde_mm_load_si128((simde__m128i*){}.s16), simde_mm_load_si128((simde__m128i*){}.s16))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[1]));
break;
case PPC_INST_VUPKLSB:
case PPC_INST_VUPKLSB128:
println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi8_epi16(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi8_epi16(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]));
break;
case PPC_INST_VUPKLSH:
case PPC_INST_VUPKLSH128:
println("\t_mm_store_si128((__m128i*){}.s32, _mm_cvtepi16_epi32(_mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]));
println("\tsimde_mm_store_si128((simde__m128i*){}.s32, simde_mm_cvtepi16_epi32(simde_mm_load_si128((simde__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]));
break;
case PPC_INST_VXOR:
case PPC_INST_VXOR128:
print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0]));
print("\tsimde_mm_store_si128((simde__m128i*){}.u8, ", v(insn.operands[0]));
if (insn.operands[1] != insn.operands[2])
println("_mm_xor_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2]));
println("simde_mm_xor_si128(simde_mm_load_si128((simde__m128i*){}.u8), simde_mm_load_si128((simde__m128i*){}.u8)));", v(insn.operands[1]), v(insn.operands[2]));
else
println("_mm_setzero_si128());");
println("simde_mm_setzero_si128());");
break;

View file

@ -17,8 +17,9 @@ target_compile_definitions(XenonUtils
)
target_include_directories(XenonUtils
PUBLIC
PUBLIC
.
"${THIRDPARTY_ROOT}/simde"
PRIVATE
"${THIRDPARTY_ROOT}/libmspack/libmspack/mspack"
"${THIRDPARTY_ROOT}/tiny-AES-c"

View file

@ -12,13 +12,13 @@
#include <cstdlib>
#include <cstring>
#include <x86intrin.h>
#include <x86/avx.h>
#include <x86/sse.h>
#include <x86/sse4.1.h>
#ifdef _WIN32
#include <intrin.h>
#else
#include <xmmintrin.h>
#include <smmintrin.h>
// SSE3 constants are missing from simde
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif
#define PPC_JOIN(x, y) x##y
@ -172,18 +172,18 @@ struct PPCCRRegister
eq = !un && (left == right);
}
inline void setFromMask(__m128 mask, int imm) noexcept
inline void setFromMask(simde__m128 mask, int imm) noexcept
{
int m = _mm_movemask_ps(mask);
int m = simde_mm_movemask_ps(mask);
lt = m == imm; // all equal
gt = 0;
eq = m == 0; // none equal
so = 0;
}
inline void setFromMask(__m128i mask, int imm) noexcept
inline void setFromMask(simde__m128i mask, int imm) noexcept
{
int m = _mm_movemask_epi8(mask);
int m = simde_mm_movemask_epi8(mask);
lt = m == imm; // all equal
gt = 0;
eq = m == 0; // none equal
@ -215,34 +215,71 @@ struct PPCFPSCRRegister
{
uint32_t csr;
static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN };
static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO };
// simde does not handle denormal flags, so we need to implement per-arch.
#if defined(__x86_64__) || defined(_M_X64)
static constexpr size_t RoundShift = 13;
static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK;
static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN };
inline uint32_t getcsr() noexcept
{
return simde_mm_getcsr();
}
inline void setcsr(uint32_t csr) noexcept
{
simde_mm_setcsr(csr);
}
#elif defined(__aarch64__) || defined(_M_ARM64)
// RMode
static constexpr size_t RoundShift = 22;
static constexpr size_t RoundMask = 3 << RoundShift;
// FZ and FZ16
static constexpr size_t FlushMask = (1 << 19) | (1 << 24);
// Nearest, Zero, -Infinity, -Infinity
static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift };
inline uint32_t getcsr() noexcept
{
uint64_t csr;
__asm__ __volatile__("mrs %0, fpcr" : "=r"(csr));
return csr;
}
inline void setcsr(uint32_t csr) noexcept
{
__asm__ __volatile__("msr fpcr, %0" : : "r"(csr));
}
#else
# error "Missing implementation for FPSCR."
#endif
inline uint32_t loadFromHost() noexcept
{
csr = _mm_getcsr();
return HostToGuest[(csr & _MM_ROUND_MASK) >> 13];
csr = getcsr();
return HostToGuest[(csr & RoundMask) >> RoundShift];
}
inline void storeFromGuest(uint32_t value) noexcept
{
csr &= ~_MM_ROUND_MASK;
csr &= ~RoundMask;
csr |= GuestToHost[value & PPC_ROUND_MASK];
_mm_setcsr(csr);
setcsr(csr);
}
static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
inline void enableFlushModeUnconditional() noexcept
{
csr |= FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}
inline void disableFlushModeUnconditional() noexcept
{
csr &= ~FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}
inline void enableFlushMode() noexcept
@ -250,7 +287,7 @@ struct PPCFPSCRRegister
if ((csr & FlushMask) != FlushMask) [[unlikely]]
{
csr |= FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}
}
@ -259,7 +296,7 @@ struct PPCFPSCRRegister
if ((csr & FlushMask) != 0) [[unlikely]]
{
csr &= ~FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}
}
};
@ -587,68 +624,80 @@ inline uint8_t VectorShiftTableR[] =
0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
};
inline __m128i _mm_adds_epu32(__m128i a, __m128i b)
inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b)
{
return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b));
return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b));
}
inline __m128i _mm_avg_epi8(__m128i a, __m128i b)
inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi8(char(128));
return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
simde__m128i c = simde_mm_set1_epi8(char(128));
return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
}
inline __m128i _mm_avg_epi16(__m128i a, __m128i b)
inline simde__m128i simde_mm_avg_epi16(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi16(short(32768));
return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
simde__m128i c = simde_mm_set1_epi16(short(32768));
return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
}
inline __m128 _mm_cvtepu32_ps_(__m128i src1)
inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1)
{
__m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127));
__m128i xmm0 = _mm_slli_epi32(src1, 31 - 8);
xmm0 = _mm_srli_epi32(xmm0, 31);
xmm0 = _mm_add_epi32(xmm0, xmm1);
xmm0 = _mm_srai_epi32(xmm0, 8);
xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000));
__m128 xmm2 = _mm_cvtepi32_ps(src1);
return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1));
simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127));
simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8);
xmm0 = simde_mm_srli_epi32(xmm0, 31);
xmm0 = simde_mm_add_epi32(xmm0, xmm1);
xmm0 = simde_mm_srai_epi32(xmm0, 8);
xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000));
simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1);
return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1));
}
inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c)
inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c)
{
__m128i d = _mm_set1_epi8(0xF);
__m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d));
return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3));
simde__m128i d = simde_mm_set1_epi8(0xF);
simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d));
return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3));
}
inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b)
inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi8(char(128));
return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
simde__m128i c = simde_mm_set1_epi8(char(128));
return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
}
inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b)
inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi16(short(32768));
return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
simde__m128i c = simde_mm_set1_epi16(short(32768));
return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
}
inline __m128i _mm_vctsxs(__m128 src1)
inline simde__m128i simde_mm_vctsxs(simde__m128 src1)
{
__m128 xmm2 = _mm_cmpunord_ps(src1, src1);
__m128i xmm0 = _mm_cvttps_epi32(src1);
__m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN));
xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1);
__m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1));
return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest));
simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1);
simde__m128i xmm0 = simde_mm_cvttps_epi32(src1);
simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN));
xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1);
simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1));
return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest));
}
inline __m128i _mm_vsr(__m128i a, __m128i b)
inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b)
{
b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61);
return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10));
b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61);
return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10));
}
#if defined(__aarch64__) || defined(_M_ARM64)
inline uint64_t __rdtsc()
{
uint64_t ret;
asm volatile("mrs %0, cntvct_el0\n\t"
: "=r"(ret)::"memory");
return ret;
}
#elif !defined(__x86_64__) && !defined(_M_X64)
# error "Missing implementation for __rdtsc()"
#endif
#endif

View file

@ -141,7 +141,7 @@ inline bool bitScanForward(uint64_t v, uint32_t *outFirstSetIndex)
}
#endif
static int lzxDecompress(const void *lzxData, size_t lzxLength, void *dst, size_t dstLength, uint32_t windowSize, void *windowData, size_t windowDataLength)
int lzxDecompress(const void *lzxData, size_t lzxLength, void *dst, size_t dstLength, uint32_t windowSize, void *windowData, size_t windowDataLength)
{
int resultCode = 1;
uint32_t windowBits;

1
thirdparty/simde vendored Submodule

@ -0,0 +1 @@
Subproject commit a532a12ca9bbdc5e6547eb602e6256b71a5377d4