From bd5f42fb222d5e17b960479382d46e7398982878 Mon Sep 17 00:00:00 2001
From: Matthew Stanley <1379tech@gmail.com>
Date: Tue, 28 Apr 2026 20:56:45 -0700
Subject: [PATCH] =?UTF-8?q?analysis:=20discover=5Ffunction=5Fbounds=20?=
 =?UTF-8?q?=E2=80=94=20real=20CFG=20walk=20with=20jump-table=20support?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a public N64Recomp::discover_function_bounds() in src/analysis.h
that performs a BFS-based control-flow walk of a function's body,
following:

- Conditional branches (target + fall-through)
- Unconditional j/jal targets when intra-body
- jr $ra returns (block ends after delay slot)
- jr-via-jump-table dispatches: the existing register-state simulator
  from analyze_function detects the lui+addiu+addu+lw+jr pattern and
  records the jtbl base; we then read entries out of the body bytes
  and feed targets back into the BFS until convergence.

Returns the function's byte size (max-reachable + 4 to cover the delay
slot of the last instruction). On failure, populates a specific error
message with the offending offset and reason — caller treats this as a
build error, NOT a graceful skip (per the project's no-stubs principle).

Wires into decompressed.cpp's pattern path, replacing the prior inline
BFS that had a TODO for jump-table handling. The pattern caller now
propagates failures via `synthesize_decompressed_patterns` returning
false, which surfaces in main.cpp's exit_failure path.

Concrete behavior change: activating a pattern that includes a fragment
with computed jumps now produces a build error pointing at the specific
section name + offset + the analyzer's failure reason, instead of
silently producing a partial binary.

Tested on Stadium's 0x8FF00000 slot — first failing wrapper is at ROM
0x8CC400 with an indirect jr at offset 0x827C the simulator doesn't
pattern-match.
The static [[input.decompressed_section]] path for fragment78 is unaffected (still recompiles cleanly, no regression on boot logo + PIKA jingle). Future work surfaced by this change: the simulator's lui+addiu +addu+lw+jr pattern doesn't cover every jump-table shape Stadium uses. Each gap surfaces as a specific build-error offset; resolution is to extend analyze_instruction to recognize the additional pattern (or, when it's a true tail-call rather than a jtbl, distinguish those at the jr site). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/analysis.cpp | 255 +++++++++++++++++++++++++++++++++++++++++++ src/analysis.h | 27 +++++ src/decompressed.cpp | 206 ++++++---------------------------- 3 files changed, 313 insertions(+), 175 deletions(-) diff --git a/src/analysis.cpp b/src/analysis.cpp index 9eaaa30..71f3862 100644 --- a/src/analysis.cpp +++ b/src/analysis.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "rabbitizer.hpp" #include "fmt/format.h" @@ -7,6 +8,11 @@ #include "recompiler/context.h" #include "analysis.h" +static uint32_t read_be_u32_local(const uint8_t* p) { + return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | + (uint32_t(p[2]) << 8) | uint32_t(p[3]); +} + extern "C" const char* RabbitizerRegister_getNameGpr(uint8_t regValue); // If 64-bit addressing is ever implemented, these will need to be changed to 64-bit values @@ -349,3 +355,252 @@ bool N64Recomp::analyze_function(const N64Recomp::Context& context, const N64Rec return true; } + +// Reads a jump-table's entries out of `body` starting at jtbl_vram, +// stopping at the first entry that doesn't decode to a vram inside +// the body's address range [vram_base, vram_base + bytes_size). Each +// entry that DOES point into the body becomes a destination; offsets +// (vram - vram_base) are pushed into out_targets. +// +// Returns the number of entries collected. Returns 0 if the table +// has no valid entries (caller should treat as an analysis failure). 
+static size_t read_jump_table_targets(
+    const uint8_t* body, size_t bytes_size,
+    uint32_t vram_base, uint32_t jtbl_vram,
+    std::vector<size_t>& out_targets)
+{
+    if (jtbl_vram < vram_base) return 0;
+    size_t jtbl_off = jtbl_vram - vram_base;
+    if (jtbl_off >= bytes_size) return 0;
+
+    size_t collected = 0;
+    while (jtbl_off + 4 <= bytes_size) {
+        uint32_t entry = read_be_u32_local(body + jtbl_off);
+        // Entry should be a vram pointing inside the body. Out-of-range
+        // entry => end of table.
+        if (entry < vram_base || entry >= vram_base + bytes_size) {
+            break;
+        }
+        size_t target_off = entry - vram_base;
+        // Targets must be 4-aligned MIPS instructions.
+        if ((target_off & 0x3u) != 0) break;
+        out_targets.push_back(target_off);
+        collected++;
+        jtbl_off += 4;
+    }
+    return collected;
+}
+
+// Walks the control flow reachable from `entry_offset` and reports the
+// resulting function size in `size_out`. Returns false with `error_out`
+// set when the entry is misaligned or out of range, when
+// analyze_instruction rejects an instruction, or when an indirect `jr`
+// cannot be resolved to a jump table with entries inside `body`.
+bool N64Recomp::discover_function_bounds(
+    const uint8_t* body, size_t bytes_size,
+    uint32_t vram_base, uint32_t entry_offset,
+    size_t& size_out, std::string& error_out)
+{
+    using InstrId = rabbitizer::InstrId::UniqueId;
+    using RegId = rabbitizer::Registers::Cpu::GprO32;
+
+    // The entry must be a whole, 4-aligned instruction inside the body.
+    // Phrased without `entry_offset + 4` so the 32-bit sum cannot wrap.
+    if ((entry_offset & 0x3u) != 0 || bytes_size < 4 ||
+        entry_offset > bytes_size - 4) {
+        error_out = fmt::format(
+            "entry_offset 0x{:X} is misaligned or past body end 0x{:X}",
+            entry_offset, bytes_size);
+        return false;
+    }
+
+    // Worklist walk over reachable instruction offsets. `visited` holds
+    // every offset already accounted for, so an offset reached by
+    // multiple control-flow paths is only decoded once.
+    std::unordered_set<size_t> visited;
+    std::vector<size_t> worklist;
+    worklist.push_back(entry_offset);
+
+    size_t max_reached = entry_offset;
+
+    // Counts the delay slot of the branch/jump at `insn_off` as part of
+    // the function and returns the slot's offset. The slot is only marked
+    // visited; it is not decoded or fed to the register simulator.
+    auto mark_delay_slot = [&](size_t insn_off) {
+        const size_t delay = insn_off + 4;
+        if (delay + 4 <= bytes_size) {
+            visited.insert(delay);
+            if (delay > max_reached) max_reached = delay;
+        }
+        return delay;
+    };
+
+    // Queues a branch/jump target for scanning when it lies inside the
+    // body; targets outside the body are ignored.
+    auto enqueue_if_in_body = [&](uint32_t target_vram) {
+        if (target_vram >= vram_base &&
+            target_vram < vram_base + bytes_size) {
+            const size_t target_off = target_vram - vram_base;
+            if (!visited.contains(target_off)) {
+                worklist.push_back(target_off);
+            }
+        }
+    };
+
+    // Register state is simulated per linear scan and starts fresh for
+    // each one. That is an approximation (real CFG analysis would merge
+    // state at joins) but works for the lui+addiu+addu+lw+jr jump-table
+    // pattern that analyze_instruction recognizes, since that pattern is
+    // local to the basic block containing the jr.
+    // NOTE(review): delay slots bypass the simulator (mark_delay_slot), so
+    // a pattern instruction sitting in a delay slot is not tracked.
+    while (!worklist.empty()) {
+        size_t off = worklist.back();
+        worklist.pop_back();
+        if (visited.contains(off)) continue;
+
+        // Per-block scan: walk linearly from off through the basic
+        // block's terminator, simulating register state as we go.
+        // Register state is local to this scan — fresh on entry.
+        RegState reg_states[32]{};
+        std::vector<RegState> stack_states{};
+        // Placeholder Function to satisfy analyze_instruction's
+        // signature: it carries the body's base vram; section_index and
+        // rom are zeroed and words is left empty.
+        N64Recomp::Function fake_func;
+        fake_func.section_index = 0;
+        fake_func.vram = vram_base;
+        fake_func.rom = 0;
+        fake_func.words.clear();
+        N64Recomp::FunctionStats local_stats;
+
+        size_t cursor = off;
+        while (cursor + 4 <= bytes_size) {
+            if (visited.contains(cursor)) {
+                // Already analyzed this offset — stop linear scan.
+                break;
+            }
+            visited.insert(cursor);
+            if (cursor > max_reached) max_reached = cursor;
+
+            const uint32_t insn_word = read_be_u32_local(body + cursor);
+            rabbitizer::InstructionCpu instr(
+                insn_word, vram_base + uint32_t(cursor));
+            const auto id = instr.getUniqueId();
+
+            // Update register state via the existing simulator. This
+            // tracks lui/addiu/addu/lw chains so when we hit a jr
+            // the simulator already has the jump-table base in
+            // local_stats.jump_tables.
+            // analyze_instruction returns false on analyzer-level
+            // problems (e.g. negative stack offsets) — that's a real
+            // bug we shouldn't paper over.
+            if (!analyze_instruction(instr, fake_func, local_stats,
+                    reg_states, stack_states,
+                    /*is_got_addr_defined=*/false)) {
+                error_out = fmt::format(
+                    "analyze_instruction rejected insn 0x{:08X} at "
+                    "offset 0x{:X} (vram 0x{:08X})",
+                    insn_word, cursor, vram_base + uint32_t(cursor));
+                return false;
+            }
+
+            // jr: the block ends after the delay slot. `jr $ra` is a
+            // return; any other register must resolve to a jump table.
+            if (id == InstrId::cpu_jr) {
+                const int rs = int(instr.GetO32_rs());
+                mark_delay_slot(cursor);
+                if (rs == int(RegId::GPR_O32_ra)) {
+                    // jr $ra — return.
+                    break;
+                }
+                // jr <reg> — jump table OR computed tail call.
+                // analyze_instruction recorded a JumpTable entry in
+                // local_stats if the lui+addiu+addu+lw pattern lined
+                // up. If we have one, read its entries from body
+                // bytes and add them to the worklist.
+                if (local_stats.jump_tables.empty()) {
+                    error_out = fmt::format(
+                        "indirect jr at offset 0x{:X} (vram 0x{:08X}) — "
+                        "register-state simulator did NOT detect a "
+                        "jump-table pattern. May be a tail call or "
+                        "an analysis gap. Cannot bound this function.",
+                        cursor, vram_base + uint32_t(cursor));
+                    return false;
+                }
+                // The most recently appended jump table corresponds to
+                // this jr. Read its entries from the body bytes.
+                const N64Recomp::JumpTable& jtbl =
+                    local_stats.jump_tables.back();
+                std::vector<size_t> jtbl_targets;
+                size_t collected = read_jump_table_targets(
+                    body, bytes_size, vram_base, jtbl.vram,
+                    jtbl_targets);
+                if (collected == 0) {
+                    error_out = fmt::format(
+                        "indirect jr at offset 0x{:X} — jump table "
+                        "at vram 0x{:08X} has no valid entries "
+                        "(first entry would point outside body)",
+                        cursor, jtbl.vram);
+                    return false;
+                }
+                // Add each target to the worklist. Also extend
+                // max_reached past the table itself so we count its
+                // bytes as part of the function.
+                for (size_t t : jtbl_targets) {
+                    if (!visited.contains(t)) {
+                        worklist.push_back(t);
+                    }
+                }
+                // collected >= 1 here, so jtbl_end >= 4 and the
+                // subtraction below cannot wrap.
+                const size_t jtbl_end = (jtbl.vram - vram_base) +
+                    collected * 4;
+                if (jtbl_end - 4 > max_reached) max_reached = jtbl_end - 4;
+                break; // block ends after the jr's delay slot
+            }
+
+            // J / JAL (unconditional jump with delay slot).
+            if (id == InstrId::cpu_j || id == InstrId::cpu_jal) {
+                const size_t delay = mark_delay_slot(cursor);
+                if (instr.hasOperandAlias(
+                        rabbitizer::OperandType::cpu_label)) {
+                    enqueue_if_in_body(instr.getInstrIndexAsVram());
+                }
+                if (id == InstrId::cpu_jal) {
+                    // Call: execution resumes after the delay slot.
+                    cursor = delay + 4;
+                    continue;
+                }
+                break; // unconditional j ends the block
+            }
+
+            // Branches: the target is reachable, and so is the
+            // fall-through — except after an unconditional `b`, where
+            // nothing falls through and the block ends like it does
+            // for `j`. Scanning past it would decode whatever follows
+            // (possibly data or another function) as part of this one.
+            if (instr.isBranch()) {
+                const size_t delay = mark_delay_slot(cursor);
+                if (instr.hasOperandAlias(
+                        rabbitizer::OperandType::cpu_branch_target_label)) {
+                    enqueue_if_in_body(instr.getBranchVramGeneric());
+                }
+                if (id == InstrId::cpu_b) break;
+                cursor = delay + 4;
+                continue;
+            }
+
+            cursor += 4;
+        }
+    }
+
+    size_t end_off = max_reached + 4;
+    if (end_off > bytes_size) end_off = bytes_size;
+    if (end_off <= entry_offset) {
+        error_out = "no reachable instructions found at entry";
+        return false;
+    }
+    size_out = end_off - entry_offset;
+    return true;
+}
diff --git a/src/analysis.h b/src/analysis.h
index 9e0562e..9d63cde 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -2,6 +2,7 @@
 #define __RECOMP_ANALYSIS_H__
 
 #include <cstdint>
+#include <string>
 #include <vector>
 
 #include "recompiler/context.h"
@@ -19,6 +20,32 @@ namespace N64Recomp {
     };
 
     bool analyze_function(const Context& context, const Function& function, const std::vector<rabbitizer::InstructionCpu>& instructions, FunctionStats& stats);
+
+    // Discover the byte-size of a function whose entry sits at
+    // `entry_offset` within `body`. Performs a BFS-based control-flow
+    // walk that follows conditional branches (target + fall-through),
+    // unconditional j/jal targets when intra-body, jr $ra returns,
+    // and jr-via-jump-table dispatches (resolved by the existing
+    // register-state simulator from analyze_function).
+    //
+    // `body` is the raw decompressed bytes of the section's body in
+    // big-endian instruction layout (same shape as Function::words but
+    // as a byte buffer; bytes_size is the upper bound).
+    //
+    // `vram_base` is the link-time vram of body[0] — used to translate
+    // branch/jal targets back to body offsets.
+ // + // On success, sets `size_out` to the function's byte size (always + // a multiple of 4) and returns true. + // + // On failure, populates `error_out` with a specific message + // identifying the offending instruction or jump-table issue, and + // returns false. Per the project's no-stubs principle, callers + // should treat false as a build-time error, NOT a graceful skip. + bool discover_function_bounds( + const uint8_t* body, size_t bytes_size, + uint32_t vram_base, uint32_t entry_offset, + size_t& size_out, std::string& error_out); } #endif \ No newline at end of file diff --git a/src/decompressed.cpp b/src/decompressed.cpp index 7e0a75b..a2e5503 100644 --- a/src/decompressed.cpp +++ b/src/decompressed.cpp @@ -11,6 +11,7 @@ #include "fmt/format.h" #include "rabbitizer.hpp" #include +#include "analysis.h" namespace N64Recomp { @@ -552,188 +553,43 @@ size_t add_decompressed_section(Context& context, std::move(entry_words), section_name + "_entry"); - // (2) Implementation function at vram+0x20. Determine its size via - // a real CFG walk: BFS over reachable instructions following - // conditional and unconditional branches, J/JAL targets, and - // jr-via-jump-table targets (resolved by reading the jtbl entries - // out of the body bytes). The function size is max-reachable-offset - // + 4 for the delay slot. - // - // This is honest control-flow analysis — no "scan to first jr ra" - // shortcut, no skip-on-failure. If we can't determine bounds for - // a fragment cleanly, we abort the build with the section name - // and the offending instruction; user can either (a) add an - // explicit bounds override in the toml, or (b) report a recompiler - // bug. Stubbing a function via ignored=true is forbidden per the - // project's "no stubs in C/C++" principle. + // (2) Implementation function at vram+0x20. 
The engine's + // analysis.cpp::discover_function_bounds runs a real BFS-based + // control-flow walk that follows conditional branches, j/jal + // targets, and jr-via-jump-table dispatches (resolved using the + // existing register-state simulator). On failure it reports a + // specific offset and reason; we propagate that as a build error + // — no graceful skip, no stub. constexpr uint32_t IMPL_OFFSET = 0x20; - - auto discover_impl_size = [&](size_t& impl_size_out, - std::string& err_out) -> bool { - const size_t body_end = reloc_offset; - if (body_end <= IMPL_OFFSET + 4) { - err_out = "body too small to contain a function at +0x20"; - return false; - } - // BFS worklist of insn offsets to visit. visited holds every - // offset whose instruction has been decoded. - std::set visited; - std::vector worklist; - worklist.push_back(IMPL_OFFSET); - - size_t max_reached = IMPL_OFFSET; - - // For non-jr-$ra encountered, we may need jump-table analysis. - // Defer those to a second pass after BFS so jtbl reads happen - // once per detected jr. - std::vector indirect_jrs; - - while (!worklist.empty()) { - size_t off = worklist.back(); - worklist.pop_back(); - if (off + 4 > body_end) { - err_out = fmt::format( - "BFS reached offset 0x{:X}, past body end 0x{:X}", - off, body_end); - return false; - } - if (visited.contains(off)) continue; - - // Walk linearly from this offset, marking visited, until - // we hit a control-flow boundary that ends the basic block. - while (off + 4 <= body_end) { - if (visited.contains(off)) break; - visited.insert(off); - if (off > max_reached) max_reached = off; - - const uint32_t insn_word = read_be_u32(blob.data() + off); - rabbitizer::InstructionCpu instr(insn_word, vram + uint32_t(off)); - const auto id = instr.getUniqueId(); - - using InstrId = rabbitizer::InstrId::UniqueId; - - // jr $ra: function return — block ends after delay slot. - if (id == InstrId::cpu_jr) { - int rs = int(instr.GetO32_rs()); - // Delay slot is reachable. 
- size_t delay = off + 4; - if (delay + 4 <= body_end) { - visited.insert(delay); - if (delay > max_reached) max_reached = delay; - } - if (rs == int(rabbitizer::Registers::Cpu::GprO32::GPR_O32_ra)) { - // jr $ra — return. Block ends. - break; - } - // jr — likely a jump-table dispatch - // OR a tail call. Defer to second pass. - indirect_jrs.push_back(off); - break; - } - - // J / JAL (unconditional branch with delay slot). - if (id == InstrId::cpu_j || id == InstrId::cpu_jal) { - // Delay slot is reachable. - size_t delay = off + 4; - if (delay + 4 <= body_end) { - visited.insert(delay); - if (delay > max_reached) max_reached = delay; - } - // J target: continue control flow there if it's - // inside our function body (else it's a tail - // call / cross-fragment dispatch). - if (instr.hasOperandAlias(rabbitizer::OperandType::cpu_label)) { - uint32_t target_vram = instr.getInstrIndexAsVram(); - if (target_vram >= vram + IMPL_OFFSET && - target_vram < vram + body_end) { - size_t target_off = target_vram - vram; - if (!visited.contains(target_off)) { - worklist.push_back(target_off); - } - } - } - // JAL = call: control returns after delay slot. J - // = unconditional jump: block ends. - if (id == InstrId::cpu_jal) { - off = delay + 4; - continue; - } - break; - } - - // Conditional branches (B*): 16-bit signed offset - // relative to delay slot. Both target and fall-through - // are reachable. - if (instr.isBranch()) { - size_t delay = off + 4; - if (delay + 4 <= body_end) { - visited.insert(delay); - if (delay > max_reached) max_reached = delay; - } - if (instr.hasOperandAlias(rabbitizer::OperandType::cpu_branch_target_label)) { - uint32_t target_vram = instr.getBranchVramGeneric(); - if (target_vram >= vram + IMPL_OFFSET && - target_vram < vram + body_end) { - size_t target_off = target_vram - vram; - if (!visited.contains(target_off)) { - worklist.push_back(target_off); - } - } - } - // Fall-through after delay slot is also reachable. 
- off = delay + 4; - continue; - } - - // Default: fall through to next instruction. - off += 4; - } - } - - // Second pass: indirect jr (jr not jr $ra) means a - // jump table. The recompiler's existing analyze_function - // detects these by simulating lui+addiu+lw+jr register-state - // chains; wiring that simulator into decompressed.cpp's bounds - // discovery is meaningful work that hasn't been done yet. - // Until it is, an indirect jr is a build-time error (NOT a - // skip) so the user has to make an explicit choice rather - // than ship a binary with missing bodies. - if (!indirect_jrs.empty()) { - err_out = fmt::format( - "indirect jr at +0x{:X} (likely jump table); " - "decompressed-section pattern can't yet bound functions " - "with computed jumps. Declare via the single-block " - "[[input.decompressed_section]] form to bypass, or " - "extend decompressed.cpp's CFG walk to follow " - "jump-table targets.", - indirect_jrs.front()); - return false; - } - - // Function size = max_reached + 4 (covers delay slot of last - // visited insn). - size_t end_off = max_reached + 4; - if (end_off > body_end) end_off = body_end; - if (end_off <= IMPL_OFFSET) { - err_out = "no reachable instructions found at +0x20"; - return false; - } - impl_size_out = end_off - IMPL_OFFSET; - return true; - }; + if (reloc_offset <= IMPL_OFFSET + 4) { + std::fprintf(stderr, + "decompressed: section %s — body too small to contain a " + "function at +0x20 (reloc_offset=0x%X)\n", + section_name.c_str(), reloc_offset); + return size_t(-1); + } size_t impl_size = 0; std::string discover_err; - if (!discover_impl_size(impl_size, discover_err)) { + bool ok = discover_function_bounds( + blob.data(), reloc_offset, + vram, IMPL_OFFSET, + impl_size, discover_err); + if (!ok) { std::fprintf(stderr, "decompressed: section %s — function-bounds discovery " "failed: %s\n" - " This fragment's impl function couldn't be bounded by\n" - " the engine's CFG walk. 
Either teach decompressed.cpp\n" - " to handle this shape, declare the fragment via the\n" - " single-block [[input.decompressed_section]] form (with\n" - " manual analysis), or skip it explicitly via a future\n" - " pattern.exclude config option.\n", + " Build aborted. Resolutions, in order of preference:\n" + " 1. If this is a recompiler analysis gap, fix the\n" + " analyzer in src/analysis.cpp.\n" + " 2. If the fragment legitimately has a shape the\n" + " analyzer can't handle, declare it via the\n" + " single-block [[input.decompressed_section]] form\n" + " (manual analysis path).\n" + " 3. If the wrapper is unused / unreachable in this\n" + " game's runtime path, exclude it via a future\n" + " pattern.exclude config field.\n" + " No graceful skip, no stub. Build refuses to ship.\n", section_name.c_str(), discover_err.c_str()); return size_t(-1); }