##
## Searches for functions in .text that are referenced by functions in .pdata
##
## XenonRecomp lets you define function boundaries explicitly with the
## `functions` property when XenonAnalyse fails to analyze them correctly
## (for example, functions containing jump tables). This script generates
## that `functions` block automatically.
##
## Input:
##     Decompiled code - Created in IDA Pro 9.0SP1 with File -> Produce File -> Create HTML File...
##     CLI output from a XenonRecomp run - When trying to compile with XenonRecomp,
##         append `> [output log file path]` to the command to save the terminal output
##
## Output:
##     XenonRecomp config - Function block for TOML to be inputted into XenonRecomp
##
## Usage:
##     python3 Auto_Function_Parser.py [IDA HTML] [XenonRecomp log] [Output TOML]
##

import re
import sys
from collections import Counter

# Enable to additionally emit, for every output entry, the switch address
# that caused it to be selected
debug = False

# Patterns are compiled once because they are applied to every listing line
_SWITCH_ERROR_RE = re.compile(r'ERROR: Switch case at ([0-9A-Fa-f]+)')
_TEXT_SECTION_RE = re.compile(r'\.section "\.text"')
_ADDRESS_RE = re.compile(r'[0-9A-Fa-f]{8}')
_SUB_START_RE = re.compile(
    r'\.text:([0-9A-Fa-f]{8}) sub_\1: *# [A-Z][A-Z][A-Z][A-Z] XREF:')
_LOC_RE = re.compile(r'^\.text:([0-9A-Fa-f]{8}) loc_\1')
_PADDING_RE = re.compile(r'\.long 0$')
_BLR_RE = re.compile('blr$')
_END_OF_FUNCTION_RE = re.compile('End of function ')
_BCTR_RE = re.compile('bctr$')
_RESTGPRLR_RE = re.compile('b __restgprlr_[0-9][0-9]$')

##
## Parse XenonRecomp log
##


def parse_switch_addresses(lines):
    """Return the unique switch-statement addresses reported as errors.

    Args:
        lines: Iterable of text lines from a XenonRecomp log.

    Returns:
        A list of hexadecimal address strings (as written in the log),
        without duplicates, sorted by numeric value so that later output is
        reproducible between runs.
    """
    addrs = set()
    for line in lines:
        # The address is captured relative to the message itself, so a
        # prefix in front of "ERROR:" cannot shift the extracted digits
        match = _SWITCH_ERROR_RE.search(line)
        if match:
            addrs.add(match.group(1))
    return sorted(addrs, key=lambda addr: int(addr, 16))


##
## Parse IDA HTML
##


def _text_xref_address(line):
    """Return the address of an 'XREF: .text:' reference on *line*, or None.

    None is returned both when there is no such reference and when the
    referenced name does not contain a parsable hexadecimal address.
    """
    marker_idx = line.find('XREF: .text:')
    if marker_idx < 0:
        return None
    underscore_idx = line.find('_', marker_idx)
    if underscore_idx > -1:
        # Reference to a label such as loc_XXXXXXXX
        candidate = line[underscore_idx+1:underscore_idx+9]
    else:
        # Reference to a raw address (12 == len('XREF: .text:'))
        candidate = line[marker_idx+12:marker_idx+20]
    try:
        return int(candidate, 16)
    except ValueError:
        return None


def parse_functions(lines):
    """Derive function boundaries from an IDA HTML listing.

    Lines before the `.section ".text"` header are ignored. After it, every
    line that carries an 8-digit hexadecimal address right after its first
    colon is classified (function label, location label, padding, blr,
    'End of function' comment, bctr, or branch to __restgprlr_NN) and used
    to start/end functions. Lines without such an address are skipped.

    Args:
        lines: Iterable of text lines from the IDA HTML file.

    Returns:
        A list of `[start, end, nested, start_type]` entries in listing
        order, where `start`/`end` are integers (`end` is 0 while unknown),
        `nested` is the list of `sub_` addresses (strings) that reference
        locations seen while the entry was the latest one, and `start_type`
        is 'sub', 'loc' or None.
    """
    functs = []

    def add_function(new_start_addr, prev_end_addr, start_type):
        # Close the latest function (if there is one) and open a new one
        if prev_end_addr is not None and functs:
            functs[-1][1] = prev_end_addr
        functs.append([new_start_addr, 0, [], start_type])

    in_text = False

    # Addresses of the most recently seen markers
    bctr_addr = 0
    blr_addr = 0
    eof_addr = 0
    restgprlr_addr = None
    pad_addr = None

    for line in lines:
        if not in_text:
            # Nothing is parsed until the .text section header is found
            if _TEXT_SECTION_RE.search(line):
                in_text = True
            continue

        # Get the current address: the 8 characters after the first colon
        colon_idx = line.find(':')
        curr_addr = line[colon_idx+1:colon_idx+9]
        if not _ADDRESS_RE.fullmatch(curr_addr):
            # Not a listing line (HTML markup, blank line, ...)
            continue
        curr_addr_int = int(curr_addr, 16)

        # Check if this is the start of a function
        sub_match = _SUB_START_RE.search(line)
        if sub_match and sub_match.group(1) == curr_addr:
            if not functs:
                # First function: there is no previous function to end
                add_function(curr_addr_int, None, 'sub')
            elif curr_addr_int-4 in (pad_addr, restgprlr_addr):
                # The entry was already opened by the padding/restgprlr
                # line just before it; only record how it starts
                functs[-1][3] = 'sub'
            elif curr_addr not in functs[-1][2]:
                # Not referenced from the latest function, so it is a new
                # function that also ends the latest one
                add_function(curr_addr_int, curr_addr_int, 'sub')

        # If this is a location
        elif _LOC_RE.search(line):
            # NOTE(review): the source this was reconstructed from had its
            # indentation collapsed; the XREF-recording branch below is
            # attached to the "previous address was a blr" check - confirm.
            if curr_addr_int-4 == blr_addr:
                # Previous address was a blr. Only treat it as the end of a
                # function if it (or a bctr) carried the 'End of function'
                # comment.
                if blr_addr == eof_addr or bctr_addr == eof_addr:
                    xref = _text_xref_address(line)
                    # No usable XREF to a .text address, or the XREF comes
                    # from after this address: start a new function here
                    if xref is None or xref > curr_addr_int:
                        add_function(curr_addr_int, curr_addr_int, 'loc')
            elif functs:
                # Remember which function references this location
                xref_idx = line.find('CODE XREF: sub_')
                if xref_idx > -1:
                    functs[-1][2].append(line[xref_idx+15:xref_idx+23])

        # Check if this line is padding
        elif functs and _PADDING_RE.search(line):
            # End the current function at the padding and open a new one at
            # the line after it
            add_function(curr_addr_int+4, curr_addr_int, None)
            pad_addr = curr_addr_int

        # Check for blr instruction
        elif _BLR_RE.search(line):
            blr_addr = curr_addr_int

        # Check for 'End of function' comment
        elif _END_OF_FUNCTION_RE.search(line):
            eof_addr = curr_addr_int

        # Check for bctr instruction
        elif _BCTR_RE.search(line):
            bctr_addr = curr_addr_int

        # Check for restgprlr instruction
        elif _RESTGPRLR_RE.search(line):
            # End the current function at this instruction and open a new
            # one at the line after it
            add_function(curr_addr_int+4, curr_addr_int, None)
            restgprlr_addr = curr_addr_int

    return functs


##
## Find .text functions that are referenced by .pdata functions
##


def find_switch_functions(functs, switch_addrs, debug=False):
    """Find the function that contains each erroneous switch statement.

    Args:
        functs: Function list as returned by `parse_functions`.
        switch_addrs: Iterable of hexadecimal address strings.
        debug: When true, each result also carries the switch address that
            selected it and results are not checked for duplicate starts.

    Returns:
        A sorted list of unique `(start, size)` tuples of integers, or
        `(start, size, switch_addr)` tuples when *debug* is true. Switch
        addresses that fall into no known function are reported with a
        warning on stdout and skipped.
    """
    found = set()
    for switch_addr in switch_addrs:
        switch_addr_int = int(switch_addr, 16)

        match = None
        for funct in functs:
            funct_start, funct_end = funct[0], funct[1]
            if switch_addr_int <= funct_start:
                # Functions are in listing order; the search stops here
                break
            if switch_addr_int <= funct_end:
                match = (funct_start, funct_end-funct_start)
                break

        if match is None:
            print(f"WARNING: Function relating to {switch_addr} not found! Skipping.")
        elif debug:
            found.add(match + (switch_addr,))
        else:
            found.add(match)

    output_functs = sorted(found)

    # Make sure there are no functions with the same starting address but different lengths
    if not debug:
        start_counts = Counter(funct[0] for funct in output_functs)
        for funct_start, count in start_counts.items():
            if count > 1:
                print(f"WARNING: {hex(funct_start)} has multiple entries of different lengths, manually find correct one.")

    return output_functs


##
## Output all found functions to TOML in correct format
##


def format_functions_toml(output_functs, debug=False):
    """Format `(start, size[, src])` tuples as a XenonRecomp `functions` array.

    An empty input yields an empty (still valid) TOML array.
    """
    rows = []
    for funct in output_functs:
        row = "\n    { address = 0x%X, size = 0x%X" % (funct[0], funct[1])
        if debug:
            # 0x prefix keeps the value a valid TOML integer
            row = row+", src = 0x"+funct[2].upper()
        rows.append(row+" }")
    return "functions = ["+",".join(rows)+"\n]"


def main(argv=None):
    """Run the parser with command-line style arguments (default: sys.argv)."""
    argv = sys.argv if argv is None else argv

    # Check if correct number of input arguments were given
    if len(argv) != 4:
        sys.exit("Auto_Function_Parser.py [IDA HTML] [XenonRecomp log] [Output TOML]")

    # Filepath input arguments
    ida_html, xenonrecomp_log, output_file = argv[1:4]

    # Only ASCII markers are searched for, so undecodable bytes are replaced
    # instead of aborting the run
    print("Parsing XenonRecomp log...")
    with open(xenonrecomp_log, 'r', encoding='utf-8', errors='replace') as file:
        switch_addrs = parse_switch_addresses(file)

    print("Parsing IDA HTML...")
    with open(ida_html, 'r', encoding='utf-8', errors='replace') as file:
        functs = parse_functions(file)

    print("Searching for needed functions...")
    output_functs = find_switch_functions(functs, switch_addrs, debug)
    print(f"{len(output_functs)} functions found!")

    print("Outputting to formatted file...")
    with open(output_file, "w") as file:
        file.write(format_functions_toml(output_functs, debug))


if __name__ == "__main__":
    main()