Merge 8fc280bed9 into 5a945705de

use raw strings to avoid escape character syntax errors in python 3.12
When checking for subroutine, don't make .text have to be at the beginning of the line, this accounts for some weird HTML formatting
2025-10-30 07:11:38 +00:00 · 2025-07-19 14:14:15 +03:00 · 2025-03-11 15:54:36 -04:00 · 2025-03-09 20:53:46 -04:00 · 2025-03-09 19:12:39 -04:00 · 2025-03-09 19:10:49 -04:00
2 changed files with 303 additions and 0 deletions
--- a/Auto_Function_Parser.py
+++ b/Auto_Function_Parser.py
@ -0,0 +1,297 @@
+##
+## Searches for functions in .text that are referenced by functions in .pdata
+##
+## Input: 
+## Decompiled code - Created in IDA Pro 9.0SP1 with File -> Produce File -> Create HTML File...
+## CLI output from a XenonRecomp run - When trying to compile with XenonRecomp, use > to save the output from the terminal
+##
+## Output: 
+## XenonRecomp config - Function block for TOML to be inputted into XenonRecomp 
+##
+
+import sys
+import re
+
+# Exactly three file paths must follow the script name; otherwise exit with the usage string
+if len(sys.argv) != 4:
+    sys.exit("Auto_Function_Parser.py [IDA HTML] [XenonRecomp log] [Output TOML]")
+
+# Unpack the file paths in the order given by the usage string
+ida_html, xenonrecomp_log, output_file = sys.argv[1:4]
+
+# Disable extra debug output 
+debug = False
+
+##
+## Parse XenonRecomp log
+##
+
+# Length of the 'ERROR: Switch case at ' prefix, i.e. the column of the address when the
+# message starts the line. Retained as a module-level name; the address itself is now
+# captured by the pattern below so a prefix before the message cannot shift it.
+switch_idx = 22
+
+# Matches the error message anywhere in the line and captures up to 8 hex digits after it
+# NOTE(review): assumes the log prints the address as bare hex digits (no 0x) - confirm
+switch_err_re = re.compile(r'ERROR: Switch case at ([0-9A-Fa-f]{1,8})')
+
+# Collect addresses in a set so each erroneous switch statement is only handled once
+switch_addrs = set()
+
+print("Parsing XenonRecomp log...")
+# Import each line of XenonRecomp log
+with open(xenonrecomp_log, 'r') as file:
+    # Read each line in the file
+    for line in file:
+        # If this line describes an error, it has the address of a problematic switch statement
+        match = switch_err_re.search(line)
+        if match:
+            # Save the address as a hex string; it is converted to an integer when searched for
+            switch_addrs.add(match.group(1))
+
+##
+## Parse IDA HTML
+##
+
+# Initialize list to store start and end of functions 
+functs = []
+
+# Count how many functions have been added
+num_functs = 0
+
+# Function for adding to function list and incrementing count
+def add_function(new_start_addr, prev_end_addr, start_type):
+    """Record a new function start and optionally close the previous entry.
+
+    Each entry in functs is a list of
+    [start address, end address, nested function addresses, start type].
+
+    new_start_addr -- start address stored for the new entry
+    prev_end_addr  -- end address written into the most recently added entry,
+                      or None to leave that entry untouched
+    start_type     -- tag stored with the entry (callers pass 'sub', 'loc' or None)
+    """
+    global num_functs
+    # Only close the previous entry if an end address was given and a previous entry
+    # exists; without the second check an empty list would raise IndexError
+    if prev_end_addr is not None and num_functs > 0:
+        # Set end address for last added function
+        functs[num_functs-1][1] = prev_end_addr
+    # Add a new function whose end address (0) and nested list are filled in later
+    functs.append([new_start_addr, 0, [], start_type])
+    # Increment the number of functions
+    num_functs = num_functs+1
+
+# Mark if we are in .text section
+in_text = False
+
+# Mark if we should end parsing
+end_parse = False
+
+# Initialize address of last bctr instruction to 0
+bctr_addr = '00000000'
+
+# Initialize address of last blr instruction to 0
+blr_addr = '00000000'
+
+# Initialize address of last 'End of function' comment to 0
+eof_addr = '00000000'
+
+# Initialize address of last restgprlr instruction to 0
+restgprlr_addr = '00000000'
+
+# Initialize address of last padding to 0
+pad_addr = 0
+
+# Patterns that do not depend on the current address are compiled once, outside the loop
+pad_re = re.compile(r'<span class="c[0-9]*">\.long </span><span class="c[0-9]*">0$')
+blr_re = re.compile('<span class="c[0-9]*">blr$')
+bctr_re = re.compile('<span class="c[0-9]*">bctr$')
+restgprlr_re = re.compile('<span class="c[0-9]*">b         </span><span class="c[0-9]*">__restgprlr_[0-9][0-9]$')
+text_section_re = re.compile(r'<span class="c[0-9]*">\.section &quot;\.text&quot;')
+
+# Import each line of decompiled code
+print("Parsing IDA HTML...")
+with open(ida_html, 'r') as file:
+    # Read each line in the file
+    for line in file:
+        # end_parse is never set by this script, so this never stops the loop early
+        if end_parse:
+            break
+
+        # Until the .text section header is found, nothing else is examined
+        if not in_text:
+            if text_section_re.search(line):
+                in_text = True
+            continue
+
+        # The current address is taken as the 8 characters after the first colon
+        colon_idx = line.find(':')
+        curr_addr = line[colon_idx+1:colon_idx+9]
+        # Escaped so unexpected characters in that slice cannot change or break the patterns
+        addr_pat = re.escape(curr_addr)
+
+        # Check if this is the start of a function (.text is deliberately not anchored here)
+        if re.search(r'\.text:'+addr_pat+' </span><span class="c[0-9]*">sub_'+addr_pat+'</span><span class="c[0-9]*">: *</span><span class="c[0-9]*"># [A-Z][A-Z][A-Z][A-Z] XREF:', line):
+            # Save current address as integer
+            curr_addr_int = int(curr_addr, 16)
+
+            # If this is not the first function being added
+            if num_functs > 0:
+                # If last address had padding or restgprlr instruction, then this function was already added
+                if curr_addr_int-4 == pad_addr or curr_addr_int-4 == restgprlr_addr:
+                    # Set function type for start address
+                    functs[num_functs-1][3] = 'sub'
+                # Otherwise add it unless it is listed as nested in the latest added function
+                elif curr_addr not in functs[num_functs-1][2]:
+                    # Add new function and last function's end address
+                    add_function(curr_addr_int, curr_addr_int, 'sub')
+            else:
+                # Add new function
+                add_function(curr_addr_int, None, 'sub')
+
+        # If this is a location
+        elif re.search(r'^\.text:'+addr_pat+' </span><span class="c[0-9]*">loc_'+addr_pat, line):
+            curr_addr_int = int(curr_addr, 16)
+            # If previous address was a blr instruction
+            if curr_addr_int-4 == int(blr_addr, 16):
+                # If previous address had an 'End of function' comment or if there was a bctr with the comment
+                if blr_addr == eof_addr or bctr_addr == eof_addr:
+                    # Find a XREF pointing to a .text address
+                    xref_idx = line.find('XREF: .text:')
+                    if xref_idx > -1:
+                        # Prefer the 8 characters after the next underscore, else those right after 'XREF: .text:'
+                        underscore_idx = line.find('_', xref_idx)
+                        if underscore_idx > -1:
+                            xref = line[underscore_idx+1:underscore_idx+9]
+                        else:
+                            xref = line[xref_idx+12:xref_idx+20]
+                    else:
+                        xref = None
+
+                    # Couldn't find XREF pointing to .text address or the XREF is after this address
+                    if xref is None or int(xref, 16) > curr_addr_int:
+                        # Add as new function; only close a previous function if one exists
+                        add_function(curr_addr_int, curr_addr_int if num_functs > 0 else None, 'loc')
+
+            # A nested location can only be recorded once a function exists to hold it
+            elif num_functs > 0:
+                # Find address of function that references this
+                xref_idx = line.find('CODE XREF: sub_')
+                # If it was found
+                if xref_idx > -1:
+                    # Store as nested function in latest function
+                    functs[num_functs-1][2].append(line[xref_idx+15:xref_idx+23])
+
+        # Check if this line is padding
+        elif num_functs > 0 and pad_re.search(line):
+            # Convert current address to integer
+            curr_addr_int = int(curr_addr, 16)
+
+            # Add a new function at the line after padding, and end the current function at this padding address
+            add_function(curr_addr_int+4, curr_addr_int, None)
+
+            # Save padding address
+            pad_addr = curr_addr_int
+
+        # Check for blr instruction
+        elif blr_re.search(line):
+            blr_addr = curr_addr
+
+        # Check for 'End of function' comment
+        elif 'End of function ' in line:
+            eof_addr = curr_addr
+
+        # Check for bctr instruction
+        elif bctr_re.search(line):
+            bctr_addr = curr_addr
+
+        # Check for restgprlr instruction
+        elif restgprlr_re.search(line):
+            # Convert current address to integer
+            curr_addr_int = int(curr_addr, 16)
+
+            # Add a new function at the line after restgprlr instruction, and end the current
+            # function at this address; there is nothing to end if no function was added yet
+            add_function(curr_addr_int+4, curr_addr_int if num_functs > 0 else None, None)
+
+            restgprlr_addr = curr_addr_int
+
+##
+## Find .text functions that are referenced by .pdata functions
+##
+
+# Initialize list for functions that need to be added to toml
+output_functs = []
+
+# Look for related functions for every unique errored switch statement
+print("Searching for needed functions...")
+for switch_addr in switch_addrs:
+    # Save current switch statement address as integer
+    switch_addr_int = int(switch_addr, 16)
+
+    # The related function for this switch statement has not been found yet
+    found_funct = False
+
+    # Walk the functions in the order they were parsed; iterating the list directly
+    # cannot run past its end, unlike indexing with a manually advanced counter
+    for curr_funct in functs:
+        curr_funct_start = curr_funct[0]
+        # Stop at the first function that does not start before the switch address
+        if switch_addr_int <= curr_funct_start:
+            break
+
+        # If switch address is before this function's end, this is the related function
+        curr_funct_end = curr_funct[1]
+        if switch_addr_int <= curr_funct_end:
+            # Save current function's start address and the function's length
+            if debug:
+                output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start), switch_addr])
+            else:
+                output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)])
+            found_funct = True
+            break
+
+    # Related function was not found (also covers an empty or exhausted function list)
+    if not found_funct:
+        print(f"WARNING: Function relating to {switch_addr} not found! Skipping.")
+
+# Duplicates are allowed in debug output, so only clean up otherwise
+if not debug:
+    # Remove duplicates and order by address so the output is the same on every run
+    output_functs = sorted(
+        set(tuple(funct) for funct in output_functs),
+        key=lambda funct: (int(funct[0], 16), int(funct[1], 16)),
+    )
+
+    # Make sure there are no functions with the same starting address but different lengths
+    for i, funct in enumerate(output_functs):
+        curr_funct_start = funct[0]
+        for other_funct in output_functs[i+1:]:
+            if curr_funct_start == other_funct[0]:
+                print(f"WARNING: {curr_funct_start} has multiple entries of different lengths, manually find correct one.")
+
+print(f"{len(output_functs)} functions found!")
+
+##
+## Output all found functions to TOML in correct format
+##
+
+# Build one formatted entry per function, then join them into the TOML array
+print("Outputting to formatted file...")
+funct_entries = []
+for funct in output_functs:
+    # Format hex to uppercase (the '0x' prefix stays lowercase)
+    curr_funct_start = '0x'+funct[0][2:].upper()
+    curr_funct_end = '0x'+funct[1][2:].upper()
+
+    # Format function
+    curr_funct = "\n    { address = "+curr_funct_start+", size = "+curr_funct_end
+    if debug:
+        # In debug mode each entry also carries the switch address it was found for
+        curr_funct = curr_funct+", src = "+funct[2]
+    curr_funct = curr_funct+" }"
+
+    funct_entries.append(curr_funct)
+
+# Joining with commas leaves no trailing comma to strip, so an empty list still
+# produces a well-formed "functions = [\n]" instead of losing the opening bracket
+output_str = "functions = ["+",".join(funct_entries)+"\n]"
+
+# Output to file
+with open(output_file, "w") as file:
+    file.write(output_str)
+
+
--- a/README.md
+++ b/README.md
@ -190,6 +190,12 @@ functions = [

 You can define function boundaries explicitly using the `functions` property if XenonAnalyse fails to analyze them correctly, for example, with functions containing jump tables.

+You can automatically generate these using the Auto_Function_Parser.py script. You will need to create an HTML file of your decompiled XEX with IDA using `File -> Produce File -> Create HTML File...` and save the terminal output from running XenonRecomp by appending `> [output log file path]` to the command.
+
+```
+python3 Auto_Function_Parser.py [input IDA HTML file path] [input XenonRecomp log file path] [output function list file path]
+```
+
 #### Invalid Instruction Skips

 ```toml
Author	SHA1	Message	Date
Jillian To	8247910df0	Merge `8fc280bed9` into `5a945705de`	2025-07-19 14:14:15 +03:00
JillianTo	8fc280bed9	use raw strings to avoid escape character syntax errors in python 3.12	2025-03-11 15:54:36 -04:00
JillianTo	4452868029	When checking for subroutine, don't make .text have to be at the beginning of the line, this accounts for some weird HTML formatting	2025-03-09 20:53:46 -04:00
JillianTo	9a4dc311c7	fix name in script	2025-03-09 19:12:39 -04:00
JillianTo	444ee2bda1	don't run duplicate check during debug because we allow duplicates during that, rename file	2025-03-09 19:10:49 -04:00
JillianTo	2365f4d697	missing 3 functions, but otherwise correct	2025-03-09 17:56:37 -04:00
JillianTo	21f1a81aa3	missing 3 functions, 0x82F08730 starts at wrong address	2025-03-09 12:53:48 -04:00
JillianTo	6dbbc6ea14	only gets address/size of 0x82893088 and size of 0x82CF7080 wrong	2025-03-08 21:22:26 -05:00
JillianTo	fe3fdbdda5	fixed typo	2025-03-08 13:00:48 -05:00
JillianTo	b18a1a6206	mostly correct	2025-03-08 12:47:45 -05:00