From b18a1a62067ed877f26f699eaf1fa4836e7831b2 Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sat, 8 Mar 2025 12:47:45 -0500 Subject: [PATCH 1/9] mostly correct --- parser.py | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 parser.py diff --git a/parser.py b/parser.py new file mode 100644 index 0000000..907b3bc --- /dev/null +++ b/parser.py @@ -0,0 +1,221 @@ +## +## Searches for functions in .text that are referenced by functions in .pdata +## +## Input: +## Decompiled code - Created in IDA Pro 9.0SP1 with File -> Produce File -> Create HTML File... +## CLI output from a XenonRecomp run - When trying to compile with XenonRecomp, use > to save the output from the terminal +## +## Output: +## XenonRecomp config - Function block for TOML to be inputted into XenonRecomp +## + +import sys +import re + +# Check if correct number of input arguments were given +if len(sys.argv) != 4: + sys.exit("parser.py [IDA HTML] [XenonRecomp log] [Output TOML]") + +# Filepath input arguments +ida_html = sys.argv[1] +xenonrecomp_log = sys.argv[2] +output_file = sys.argv[3] + +## +## Parse XenonRecomp log +## + +# The starting index of the erroneous switch statement address in the XenonRecomp log +switch_idx = 22 + +# Initialize list to store erroneous switch statement addresses +switch_addrs = [] + +print("Parsing XenonRecomp log...") +# Import each line of XenonRecomp log +with open(xenonrecomp_log, 'r') as file: + # Read each line in the file + for line in file: + # If this line describes an error, it has the address of a problematic switch statement + if re.search('ERROR', line) != None: + # Save the address as integer + switch_addrs.append(line[switch_idx:switch_idx+8]) + +# Save only unique addresses and sort +switch_addrs = set(switch_addrs) + +## +## Parse IDA HTML +## + +# See if current function is referenced by the inputted comparison address +def compare_xref_addr(line, compare_addr): + # Get the address of the 
referencing function + xref_idx = line.find('CODE XREF: sub_') + # If there is not a referencing function or it is in a different file, this doesn't need to be verified + if xref_idx == -1: + return True + else: + xref = line[xref_idx+15:xref_idx+23] + + # Check equality between XREF address and comparison address + return xref == compare_addr + +# Initialize list to store start and end of functions +functs = [] + +# Count how many functions have been added +num_functs = 0 + +# Mark if we are in .text section +in_text = False + +# Mark if we should end parsing +end_parse = False + +# Initialize address of last padding to 0 +pad_addr = '00000000' + +# Import each line of decompiled code +print("Parsing IDA HTML...") +with open(ida_html, 'r') as file: + # Read each line in the file + for line in file: + if not end_parse: + # If in .text + if in_text: + # Get the current address + colon_idx = line.find(':') + curr_addr = line[colon_idx+1:colon_idx+9] + + # Check if this is the start of a function + if re.search('^\.text:'+curr_addr+' sub_'+curr_addr, line): + # Check if this is a new function and not part of a switch + if num_functs > 0: + # If the referencing function is not the last added function, then it is not part of a switch + equal_xref = compare_xref_addr(line, functs[num_functs-1][0]) + if equal_xref: + # Add this address as a new function + functs.append([curr_addr, 0]) + num_functs = num_functs+1 + # Convert addresses to integer for comparison + curr_addr_int = int(curr_addr, 16) + pad_addr_int = int(pad_addr, 16) + # If previous address was padding, end last function at the padding + if curr_addr_int-4 == pad_addr_int: + functs[num_functs-2][1] = pad_addr_int + # Else, end last function as this address + else: + functs[num_functs-2][1] = curr_addr_int + + # If this is the first function to be added, don't need to check if it is part of a switch + else: + # Add this address as a new function + functs.append([curr_addr, 0]) + num_functs = num_functs+1 + + # 
If this is not the start of a function + else: + # Check if it is a nested loc_ or def_ + if re.search('^\.text:'+curr_addr+' [ld][oe][cf]_'+curr_addr, line): + # If the referencing function is not the last added function, then it is not part of a switch + if not compare_xref_addr(line, functs[num_functs-1][0]): + # Add this address as a new function + functs.append([curr_addr, 0]) + num_functs = num_functs+1 + # Convert addresses to integer for comparison + curr_addr_int = int(curr_addr, 16) + pad_addr_int = int(pad_addr, 16) + # If previous address was padding, end last function at the padding + if curr_addr_int-4 == pad_addr_int: + functs[num_functs-2][1] = pad_addr_int + # End the last function at the previous address + else: + functs[num_functs-2][1] = curr_addr_int + + # Check if this line is padding + elif re.search('\.long 0$', line): + # Save address of most recently found padding + pad_addr = curr_addr + + # Check if we are still in .text + elif re.search('\.text:', line) == None: + # If not, end parsing + end_parse = True + + # If not in .text + else: + # If .text section header found + if re.search('\.section "\.text"', line) != None: + in_text = True + +## +## Find .text functions that are referenced by .pdata functions +## + +# Initialize list for functions that need to be added to toml +output_functs = [] + +# Look for related functions for every unique errored switch statement +print("Searching for needed functions...") +for switch_addr in switch_addrs: + # Start looking at first subroutine + curr_funct_idx = 0 + + # Save current switch statement address as integer + switch_addr_int = int(switch_addr, 16) + + # The related function for this switch statement has not been found yet + search_for_funct = True + + # Start search for function relating to switch statement + while(search_for_funct): + curr_funct = functs[curr_funct_idx] + # If switch address is after this function's start + curr_funct_start = int(curr_funct[0], 16) + if(switch_addr_int > 
curr_funct_start): + # If switch address is before this function's end + curr_funct_end = curr_funct[1] + if(switch_addr_int <= curr_funct_end): + # Save current function's start address and the function's length + output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)]) + # Don't need to continue search for this switch statement + search_for_funct = False + + # Look in next function + curr_funct_idx = curr_funct_idx + 1 + + # Related function was not found + else: + print(f"WARNING: Function relating to {switch_addr} not found") + # Don't need to continue search for this switch statement + search_for_funct = False + +print(f"{len(output_functs)} functions found!") + +# Create formatted string to export to TOML +output_str = "functions = [" + +# Append all function addresses and lengths to formatted string +for funct in output_functs: + # Format hex to uppercase + curr_funct_start = '0x'+funct[0][2:].upper() + curr_funct_end = '0x'+funct[1][2:].upper() + + # Format function + curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end+" }," + + # Add to complete output string + output_str = output_str+curr_funct + +# Delete last comma +output_str = output_str[:len(output_str)-1] + +# Add last bracket +output_str = output_str+"\n]" + +# Output to file +with open(output_file, "w") as file: + file.write(output_str) + + From fe3fdbdda5a5f5051f43c4743aa576997b666636 Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sat, 8 Mar 2025 13:00:48 -0500 Subject: [PATCH 2/9] fixed typo --- parser.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parser.py b/parser.py index 907b3bc..8577788 100644 --- a/parser.py +++ b/parser.py @@ -89,12 +89,11 @@ with open(ida_html, 'r') as file: curr_addr = line[colon_idx+1:colon_idx+9] # Check if this is the start of a function - if re.search('^\.text:'+curr_addr+' sub_'+curr_addr, line): + if re.search('^\.text:'+curr_addr+' sub_'+curr_addr, line): # Check if this is a new 
function and not part of a switch if num_functs > 0: # If the referencing function is not the last added function, then it is not part of a switch - equal_xref = compare_xref_addr(line, functs[num_functs-1][0]) - if equal_xref: + if not compare_xref_addr(line, functs[num_functs-1][0]): # Add this address as a new function functs.append([curr_addr, 0]) num_functs = num_functs+1 From 6dbbc6ea147517d96fb551f8210f1acaf9d8db20 Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sat, 8 Mar 2025 21:22:26 -0500 Subject: [PATCH 3/9] only gets address/size of 0x82893088 and size of 0x82CF7080 wrong --- parser.py | 126 +++++++++++++++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/parser.py b/parser.py index 8577788..7297e38 100644 --- a/parser.py +++ b/parser.py @@ -48,31 +48,33 @@ switch_addrs = set(switch_addrs) ## Parse IDA HTML ## -# See if current function is referenced by the inputted comparison address -def compare_xref_addr(line, compare_addr): - # Get the address of the referencing function - xref_idx = line.find('CODE XREF: sub_') - # If there is not a referencing function or it is in a different file, this doesn't need to be verified - if xref_idx == -1: - return True - else: - xref = line[xref_idx+15:xref_idx+23] - - # Check equality between XREF address and comparison address - return xref == compare_addr - # Initialize list to store start and end of functions functs = [] # Count how many functions have been added num_functs = 0 +# Function for adding to function list and incrementing count +def add_function(new_start_addr, prev_end_addr): + global num_functs + # If an end address for the last added function was specified + if prev_end_addr != None: + # Set end address for last added function + functs[num_functs-1][1] = prev_end_addr + # Add a new function to the list with the specified starting address + functs.append([new_start_addr, 0, []]) + # Increment the number of functions + num_functs = num_functs+1 + # 
Mark if we are in .text section in_text = False # Mark if we should end parsing end_parse = False +# Initialize address of last blr instruction to 0 +blr_addr = '00000000' + # Initialize address of last padding to 0 pad_addr = '00000000' @@ -89,58 +91,54 @@ with open(ida_html, 'r') as file: curr_addr = line[colon_idx+1:colon_idx+9] # Check if this is the start of a function - if re.search('^\.text:'+curr_addr+' sub_'+curr_addr, line): - # Check if this is a new function and not part of a switch + if re.search('^\.text:'+curr_addr+' sub_'+curr_addr+': *# [A-Z][A-Z][A-Z][A-Z] XREF:.*', line): + # Save current address as integer + curr_addr_int = int(curr_addr, 16) + if num_functs > 0: - # If the referencing function is not the last added function, then it is not part of a switch - if not compare_xref_addr(line, functs[num_functs-1][0]): - # Add this address as a new function - functs.append([curr_addr, 0]) - num_functs = num_functs+1 - # Convert addresses to integer for comparison - curr_addr_int = int(curr_addr, 16) - pad_addr_int = int(pad_addr, 16) - # If previous address was padding, end last function at the padding - if curr_addr_int-4 == pad_addr_int: - functs[num_functs-2][1] = pad_addr_int - # Else, end last function as this address - else: - functs[num_functs-2][1] = curr_addr_int - - # If this is the first function to be added, don't need to check if it is part of a switch + # If last address had padding, then this function was already added + if not curr_addr_int-4 == int(pad_addr, 16): + # Check if this function is part of latest added function + is_nested_funct = False + nested_functs = functs[num_functs-1][2] + for nested_funct in nested_functs: + is_nested_funct = nested_funct==curr_addr + + # If last address was not padding and not nested in latest function + if not is_nested_funct: + # If this is not the first function being added + if num_functs > 0: + # Add new function and last function's end address + add_function(curr_addr_int, curr_addr_int) 
else: - # Add this address as a new function - functs.append([curr_addr, 0]) - num_functs = num_functs+1 + # Add new function + add_function(curr_addr_int, None) - # If this is not the start of a function - else: - # Check if it is a nested loc_ or def_ - if re.search('^\.text:'+curr_addr+' [ld][oe][cf]_'+curr_addr, line): - # If the referencing function is not the last added function, then it is not part of a switch - if not compare_xref_addr(line, functs[num_functs-1][0]): - # Add this address as a new function - functs.append([curr_addr, 0]) - num_functs = num_functs+1 - # Convert addresses to integer for comparison - curr_addr_int = int(curr_addr, 16) - pad_addr_int = int(pad_addr, 16) - # If previous address was padding, end last function at the padding - if curr_addr_int-4 == pad_addr_int: - functs[num_functs-2][1] = pad_addr_int - # End the last function at the previous address - else: - functs[num_functs-2][1] = curr_addr_int - - # Check if this line is padding - elif re.search('\.long 0$', line): - # Save address of most recently found padding - pad_addr = curr_addr + # If this is a location + elif re.search('^\.text:'+curr_addr+' loc_'+curr_addr, line): + curr_addr_int = int(curr_addr, 16) + # If previous address was a blr instruction + if curr_addr_int-4 == blr_addr: + print(curr_addr) + add_function(curr_addr_int, curr_addr_int) + # If not, store as nested function in latest function + else: + # Find address of function that references this + xref_idx = line.find('XREF: sub_') + # If it was found + if xref_idx > -1: + # Store as nested function in latest function + functs[num_functs-1][2].append(line[xref_idx+10:xref_idx+18]) - # Check if we are still in .text - elif re.search('\.text:', line) == None: - # If not, end parsing - end_parse = True + # Check if this line is padding + elif num_functs > 0 and re.search('\.long 0$', line): + curr_addr_int = int(curr_addr, 16) + # Add a new function at the line after padding, and end the current function at 
this padding address + add_function(curr_addr_int+4, curr_addr_int) + + # Check for blr instruction + elif re.search('blr', line): + blr_addr = curr_addr # If not in .text else: @@ -171,13 +169,13 @@ for switch_addr in switch_addrs: while(search_for_funct): curr_funct = functs[curr_funct_idx] # If switch address is after this function's start - curr_funct_start = int(curr_funct[0], 16) + curr_funct_start = curr_funct[0] if(switch_addr_int > curr_funct_start): # If switch address is before this function's end curr_funct_end = curr_funct[1] if(switch_addr_int <= curr_funct_end): # Save current function's start address and the function's length - output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)]) + output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start), switch_addr]) # Don't need to continue search for this switch statement search_for_funct = False @@ -202,7 +200,7 @@ for funct in output_functs: curr_funct_end = '0x'+funct[1][2:].upper() # Format function - curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end+" }," + curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end+" src = "+funct[2]+" }," # Add to complete output string output_str = output_str+curr_funct From 21f1a81aa30520d71d35b54d55100f68e87da8cd Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sun, 9 Mar 2025 12:53:48 -0400 Subject: [PATCH 4/9] missing 3 functions, 0x82F08730 starts at wrong address --- README.md | 6 +++ parser.py | 120 +++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 107 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 01d2542..b13285e 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,12 @@ functions = [ You can define function boundaries explicitly using the `functions` property if XenonAnalyse fails to analyze them correctly, for example, with functions containing jump tables. 
+You can automatically generate these using the Auto_Function_Parser.py script. You will need to create an HTML file of your decompiled XEX with IDA using `File -> Produce File -> Create HTML File...` and save the terminal output from running XenonRecomp by appending `> [output log file path]` to the command. + +``` +python3 Auto_Function_Parser.py [input IDA HTML file path] [input XenonRecomp log file path] [output function list file path] +``` + #### Invalid Instruction Skips ```toml diff --git a/parser.py b/parser.py index 7297e38..d3699b6 100644 --- a/parser.py +++ b/parser.py @@ -21,6 +21,9 @@ ida_html = sys.argv[1] xenonrecomp_log = sys.argv[2] output_file = sys.argv[3] +# Disable extra debug output +debug = False + ## ## Parse XenonRecomp log ## @@ -37,7 +40,7 @@ with open(xenonrecomp_log, 'r') as file: # Read each line in the file for line in file: # If this line describes an error, it has the address of a problematic switch statement - if re.search('ERROR', line) != None: + if re.search('ERROR: Switch case at ', line) != None: # Save the address as integer switch_addrs.append(line[switch_idx:switch_idx+8]) @@ -55,14 +58,14 @@ functs = [] num_functs = 0 # Function for adding to function list and incrementing count -def add_function(new_start_addr, prev_end_addr): +def add_function(new_start_addr, prev_end_addr, start_type): global num_functs # If an end address for the last added function was specified if prev_end_addr != None: # Set end address for last added function functs[num_functs-1][1] = prev_end_addr # Add a new function to the list with the specified starting address - functs.append([new_start_addr, 0, []]) + functs.append([new_start_addr, 0, [], start_type]) # Increment the number of functions num_functs = num_functs+1 @@ -75,8 +78,11 @@ end_parse = False # Initialize address of last blr instruction to 0 blr_addr = '00000000' +# Initialize address of last bctr instruction to 0 +bctr_addr = '00000000' + # Initialize address of last padding to 0 -pad_addr = 
'00000000' +pad_addr = 0 # Import each line of decompiled code print("Parsing IDA HTML...") @@ -95,32 +101,80 @@ with open(ida_html, 'r') as file: # Save current address as integer curr_addr_int = int(curr_addr, 16) + # If this is not the first function being added if num_functs > 0: # If last address had padding, then this function was already added - if not curr_addr_int-4 == int(pad_addr, 16): + if curr_addr_int-4 == pad_addr: + # Set function type for start address + functs[num_functs-1][3] = 'sub' + else: # Check if this function is part of latest added function is_nested_funct = False nested_functs = functs[num_functs-1][2] for nested_funct in nested_functs: - is_nested_funct = nested_funct==curr_addr + if nested_funct == curr_addr: + is_nested_funct = True + break # If last address was not padding and not nested in latest function if not is_nested_funct: - # If this is not the first function being added - if num_functs > 0: - # Add new function and last function's end address - add_function(curr_addr_int, curr_addr_int) + # Add new function and last function's end address + add_function(curr_addr_int, curr_addr_int, 'sub') else: # Add new function - add_function(curr_addr_int, None) + add_function(curr_addr_int, None, 'sub') # If this is a location elif re.search('^\.text:'+curr_addr+' loc_'+curr_addr, line): curr_addr_int = int(curr_addr, 16) + curr_funct = functs[num_functs-1] # If previous address was a blr instruction - if curr_addr_int-4 == blr_addr: - print(curr_addr) - add_function(curr_addr_int, curr_addr_int) + if curr_addr_int-4 == int(blr_addr, 16): + # If last added function is a subroutine and has no nested functions + if curr_funct[3] == 'sub' and not curr_funct[2]: + xref_idx = line.find('XREF: sub_') + # If XREF is a subroutine + if xref_idx > -1: + xref = line[xref_idx+10:xref_idx+18] + # If the XREF is equivalent to the last function's start address + if int(xref, 16) == curr_funct[0]: + # Store as nested function in latest function + 
functs[num_functs-1][2].append(xref) + # If not, add this address as a new function + else: + add_function(curr_addr_int, curr_addr_int, 'loc') + # If not, add this address as new function + else: + add_function(curr_addr_int, curr_addr_int, 'loc') + + # If last added function is not a subroutine or has nested functions: + else: + # Check for XREF to subroutine + xref_idx = line.find('XREF: sub_') + if xref_idx > -1: + xref = line[xref_idx+10:xref_idx+18] + # If not found, check for XREF to .text address + else: + xref_idx = line.find('XREF: .text:') + if xref_idx > -1: + underscore_idx = line.find('_', xref_idx) + # If referencing sub_, loc_, etc. + if underscore_idx > -1: + xref = line[underscore_idx+1:underscore_idx+9] + # Else, there's only the address after .text + else: + xref = line[xref_idx+12:xref_idx+20] + else: + xref = '-1' + + # If XREF points to subroutine or .text address before current address + if int(xref, 16) < curr_addr_int: + # Store as nested function + functs[num_functs-1][2].append(xref) + # If not, add this address as new funciton + else: + add_function(curr_addr_int, curr_addr_int, 'loc') + # If not, store as nested function in latest function else: # Find address of function that references this @@ -132,9 +186,14 @@ with open(ida_html, 'r') as file: # Check if this line is padding elif num_functs > 0 and re.search('\.long 0$', line): + # Convert current address to integer curr_addr_int = int(curr_addr, 16) + # Add a new function at the line after padding, and end the current function at this padding address - add_function(curr_addr_int+4, curr_addr_int) + add_function(curr_addr_int+4, curr_addr_int, None) + + # Save padding address + pad_addr = curr_addr_int # Check for blr instruction elif re.search('blr', line): @@ -175,7 +234,11 @@ for switch_addr in switch_addrs: curr_funct_end = curr_funct[1] if(switch_addr_int <= curr_funct_end): # Save current function's start address and the function's length - 
output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start), switch_addr]) + if debug: + output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start), switch_addr]) + else: + output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)]) + # Don't need to continue search for this switch statement search_for_funct = False @@ -184,23 +247,42 @@ for switch_addr in switch_addrs: # Related function was not found else: - print(f"WARNING: Function relating to {switch_addr} not found") + print(f"WARNING: Function relating to {switch_addr} not found! Skipping.") # Don't need to continue search for this switch statement search_for_funct = False -print(f"{len(output_functs)} functions found!") +# Remove duplicates +if not debug: + output_functs = list(set(tuple(funct) for funct in output_functs)) + +# Make sure there are no functions with the same starting address but different lengths +for i in range(len(output_functs)): + for j in range(i+1, len(output_functs)): + curr_funct_start = output_functs[i][0] + if curr_funct_start == output_functs[j][0]: + print(f"WARNING: {curr_funct_start} has multiple entries of different lengths, manually find correct one.") + +print(f"{len(output_functs)} functions found!") + +## +## Output all found functions to TOML in correct format +## # Create formatted string to export to TOML output_str = "functions = [" # Append all function addresses and lengths to formatted string +print("Outputting to formatted file...") for funct in output_functs: # Format hex to uppercase curr_funct_start = '0x'+funct[0][2:].upper() curr_funct_end = '0x'+funct[1][2:].upper() # Format function - curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end+" src = "+funct[2]+" }," + curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end + if debug: + curr_funct = curr_funct+", src = "+funct[2] + curr_funct = curr_funct+" }," # Add to complete output string output_str = 
output_str+curr_funct From 2365f4d69761cc04a7627edbdf2dcc5b36285f21 Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sun, 9 Mar 2025 17:56:37 -0400 Subject: [PATCH 5/9] missing 3 functions, but otherwise correct --- parser.py | 94 ++++++++++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/parser.py b/parser.py index d3699b6..05da7f0 100644 --- a/parser.py +++ b/parser.py @@ -75,11 +75,17 @@ in_text = False # Mark if we should end parsing end_parse = False +# Initialize address of last bctr instruction to 0 +bctr_addr = '00000000' + # Initialize address of last blr instruction to 0 blr_addr = '00000000' -# Initialize address of last bctr instruction to 0 -bctr_addr = '00000000' +# Initialize address of last 'End of function' comment to 0 +eof_addr = '00000000' + +# Initialize address of last restgprlr instruction to 0 +restgprlr_addr = '00000000' # Initialize address of last padding to 0 pad_addr = 0 @@ -103,8 +109,8 @@ with open(ida_html, 'r') as file: # If this is not the first function being added if num_functs > 0: - # If last address had padding, then this function was already added - if curr_addr_int-4 == pad_addr: + # If last address had padding or restgprlr instruction, then this function was already added + if curr_addr_int-4 == pad_addr or curr_addr_int-4 == restgprlr_addr: # Set function type for start address functs[num_functs-1][3] = 'sub' else: @@ -130,59 +136,31 @@ with open(ida_html, 'r') as file: curr_funct = functs[num_functs-1] # If previous address was a blr instruction if curr_addr_int-4 == int(blr_addr, 16): - # If last added function is a subroutine and has no nested functions - if curr_funct[3] == 'sub' and not curr_funct[2]: - xref_idx = line.find('XREF: sub_') - # If XREF is a subroutine + # If previous address had an 'End of function' comment or if there was a bctr with the comment + if blr_addr == eof_addr or bctr_addr == eof_addr: + # Find a XREF pointing to a .text address + 
xref_idx = line.find('XREF: .text:') if xref_idx > -1: - xref = line[xref_idx+10:xref_idx+18] - # If the XREF is equivalent to the last function's start address - if int(xref, 16) == curr_funct[0]: - # Store as nested function in latest function - functs[num_functs-1][2].append(xref) - # If not, add this address as a new function - else: - add_function(curr_addr_int, curr_addr_int, 'loc') - # If not, add this address as new function - else: - add_function(curr_addr_int, curr_addr_int, 'loc') - - # If last added function is not a subroutine or has nested functions: - else: - # Check for XREF to subroutine - xref_idx = line.find('XREF: sub_') - if xref_idx > -1: - xref = line[xref_idx+10:xref_idx+18] - # If not found, check for XREF to .text address - else: - xref_idx = line.find('XREF: .text:') - if xref_idx > -1: - underscore_idx = line.find('_', xref_idx) - # If referencing sub_, loc_, etc. - if underscore_idx > -1: - xref = line[underscore_idx+1:underscore_idx+9] - # Else, there's only the address after .text - else: - xref = line[xref_idx+12:xref_idx+20] + underscore_idx = line.find('_', xref_idx) + if underscore_idx > -1: + xref = line[underscore_idx+1:underscore_idx+9] else: - xref = '-1' - - # If XREF points to subroutine or .text address before current address - if int(xref, 16) < curr_addr_int: - # Store as nested function - functs[num_functs-1][2].append(xref) - # If not, add this address as new funciton + xref = line[xref_idx+12:xref_idx+20] else: + xref = None + + # Couldn't find XREF pointing to .text address or the XREF is after this address + if xref == None or int(xref, 16) > curr_addr_int: + # Add as new function add_function(curr_addr_int, curr_addr_int, 'loc') - # If not, store as nested function in latest function else: # Find address of function that references this - xref_idx = line.find('XREF: sub_') + xref_idx = line.find('CODE XREF: sub_') # If it was found if xref_idx > -1: # Store as nested function in latest function - 
functs[num_functs-1][2].append(line[xref_idx+10:xref_idx+18]) + functs[num_functs-1][2].append(line[xref_idx+15:xref_idx+23]) # Check if this line is padding elif num_functs > 0 and re.search('\.long 0$', line): @@ -196,8 +174,26 @@ with open(ida_html, 'r') as file: pad_addr = curr_addr_int # Check for blr instruction - elif re.search('blr', line): - blr_addr = curr_addr + elif re.search('blr$', line): + blr_addr = curr_addr + + # Check for 'End of function' comment + elif re.search('End of function ', line): + eof_addr = curr_addr + + # Check for bctr instruction + elif re.search('bctr$', line): + bctr_addr = curr_addr + + # Check for restgprlr instruction + elif re.search('b __restgprlr_[0-9][0-9]$', line): + # Convert current address to integer + curr_addr_int = int(curr_addr, 16) + + # Add a new function at the line after restgprlr instruction, and end the current function at this address + add_function(curr_addr_int+4, curr_addr_int, None) + + restgprlr_addr = curr_addr_int # If not in .text else: From 444ee2bda177dc25aa8c151acc1553ba474dbb7f Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sun, 9 Mar 2025 19:10:49 -0400 Subject: [PATCH 6/9] don't run duplicate check during debug because we allow duplicates during that, rename file --- parser.py => Auto_Function_Parser.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) rename parser.py => Auto_Function_Parser.py (97%) diff --git a/parser.py b/Auto_Function_Parser.py similarity index 97% rename from parser.py rename to Auto_Function_Parser.py index 05da7f0..cce8094 100644 --- a/parser.py +++ b/Auto_Function_Parser.py @@ -252,11 +252,12 @@ if not debug: output_functs = list(set(tuple(funct) for funct in output_functs)) # Make sure there are no functions with the same starting address but different lengths -for i in range(len(output_functs)): - for j in range(i+1, len(output_functs)): - curr_funct_start = output_functs[i][0] - if curr_funct_start == output_functs[j][0]: - print(f"WARNING: 
{curr_funct_start} has multiple entries of different lengths, manually find correct one.") +if not debug: + for i in range(len(output_functs)): + for j in range(i+1, len(output_functs)): + curr_funct_start = output_functs[i][0] + if curr_funct_start == output_functs[j][0]: + print(f"WARNING: {curr_funct_start} has multiple entries of different lengths, manually find correct one.") print(f"{len(output_functs)} functions found!") From 9a4dc311c7776fd6618987fd1f1d6f09fdff623c Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sun, 9 Mar 2025 19:12:39 -0400 Subject: [PATCH 7/9] fix name in script --- Auto_Function_Parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Auto_Function_Parser.py b/Auto_Function_Parser.py index cce8094..7177b8e 100644 --- a/Auto_Function_Parser.py +++ b/Auto_Function_Parser.py @@ -14,7 +14,7 @@ import re # Check if correct number of input arguments were given if len(sys.argv) != 4: - sys.exit("parser.py [IDA HTML] [XenonRecomp log] [Output TOML]") + sys.exit("Auto_Function_Parser.py [IDA HTML] [XenonRecomp log] [Output TOML]") # Filepath input arguments ida_html = sys.argv[1] From 4452868029753602a884068b63aa72c3dbba6805 Mon Sep 17 00:00:00 2001 From: JillianTo Date: Sun, 9 Mar 2025 20:53:46 -0400 Subject: [PATCH 8/9] When checking for subroutine, don't make .text have to be at the beginning of the line, this accounts for some weird HTML formatting --- Auto_Function_Parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Auto_Function_Parser.py b/Auto_Function_Parser.py index 7177b8e..2e584c2 100644 --- a/Auto_Function_Parser.py +++ b/Auto_Function_Parser.py @@ -103,7 +103,7 @@ with open(ida_html, 'r') as file: curr_addr = line[colon_idx+1:colon_idx+9] # Check if this is the start of a function - if re.search('^\.text:'+curr_addr+' sub_'+curr_addr+': *# [A-Z][A-Z][A-Z][A-Z] XREF:.*', line): + if re.search('.text:'+curr_addr+' sub_'+curr_addr+': *# [A-Z][A-Z][A-Z][A-Z] XREF:', line): # Save current 
address as integer curr_addr_int = int(curr_addr, 16) From 8fc280bed99903d7bfaf1003e18cfec0c627141d Mon Sep 17 00:00:00 2001 From: JillianTo Date: Tue, 11 Mar 2025 15:54:36 -0400 Subject: [PATCH 9/9] use raw strings to avoid escape character syntax errors in python 3.12 --- Auto_Function_Parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Auto_Function_Parser.py b/Auto_Function_Parser.py index 2e584c2..d86a23e 100644 --- a/Auto_Function_Parser.py +++ b/Auto_Function_Parser.py @@ -40,7 +40,7 @@ with open(xenonrecomp_log, 'r') as file: # Read each line in the file for line in file: # If this line describes an error, it has the address of a problematic switch statement - if re.search('ERROR: Switch case at ', line) != None: + if re.search('ERROR: Switch case at ', line): # Save the address as integer switch_addrs.append(line[switch_idx:switch_idx+8]) @@ -103,7 +103,7 @@ with open(ida_html, 'r') as file: curr_addr = line[colon_idx+1:colon_idx+9] # Check if this is the start of a function - if re.search('.text:'+curr_addr+' sub_'+curr_addr+': *# [A-Z][A-Z][A-Z][A-Z] XREF:', line): + if re.search(r'\.text:'+curr_addr+' sub_'+curr_addr+': *# [A-Z][A-Z][A-Z][A-Z] XREF:', line): # Save current address as integer curr_addr_int = int(curr_addr, 16) @@ -131,7 +131,7 @@ with open(ida_html, 'r') as file: add_function(curr_addr_int, None, 'sub') # If this is a location - elif re.search('^\.text:'+curr_addr+' loc_'+curr_addr, line): + elif re.search(r'^\.text:'+curr_addr+' loc_'+curr_addr, line): curr_addr_int = int(curr_addr, 16) curr_funct = functs[num_functs-1] # If previous address was a blr instruction @@ -163,7 +163,7 @@ with open(ida_html, 'r') as file: functs[num_functs-1][2].append(line[xref_idx+15:xref_idx+23]) # Check if this line is padding - elif num_functs > 0 and re.search('\.long 0$', line): + elif num_functs > 0 and re.search(r'\.long 0$', line): # Convert current address to integer curr_addr_int = int(curr_addr, 16) @@ -198,7 
+198,7 @@ with open(ida_html, 'r') as file: # If not in .text else: # If .text section header found - if re.search('\.section "\.text"', line) != None: + if re.search(r'\.section "\.text"', line): in_text = True ##