diff --git a/crates/prometeu-bytecode/src/layout.rs b/crates/prometeu-bytecode/src/layout.rs
index 56cf6197..6fac6dfd 100644
--- a/crates/prometeu-bytecode/src/layout.rs
+++ b/crates/prometeu-bytecode/src/layout.rs
@@ -1,7 +1,9 @@
 //! Shared bytecode layout utilities, used by both compiler (emitter/linker)
 //! and the VM (verifier/loader). This ensures a single source of truth for
-//! how function ranges and jump targets are interpreted post-link.
+//! how function ranges, instruction boundaries, and pc→function lookups are
+//! interpreted post-link.
 
+use crate::decoder::decode_next;
 use crate::FunctionMeta;
 
 /// Returns the absolute end (exclusive) of the function at `func_idx`,
@@ -31,6 +33,24 @@ pub fn function_len_from_next(functions: &[FunctionMeta], func_idx: usize, code_
     end.saturating_sub(start)
 }
 
+/// Canonical function range `[start, end)`; `end` is the next function's `code_offset`, or
+/// `code_len_total` for the last function. An out-of-range `func_idx` yields `start == 0`.
+#[inline]
+pub fn function_range(functions: &[FunctionMeta], func_idx: usize, code_len_total: usize) -> (usize, usize) {
+    let start = functions
+        .get(func_idx)
+        .map(|f| f.code_offset as usize)
+        .unwrap_or(0); // no such function: start defaults to 0
+    let end = function_end_from_next(functions, func_idx, code_len_total);
+    (start, end)
+}
+
+/// Canonical function length in bytes; thin wrapper over `function_len_from_next`.
+#[inline]
+pub fn function_len(functions: &[FunctionMeta], func_idx: usize, code_len_total: usize) -> usize {
+    function_len_from_next(functions, func_idx, code_len_total)
+}
+
 /// Recomputes all `code_len` values in place from the next function start
 /// (exclusive end), using the combined code buffer length for the last one.
 pub fn recompute_function_lengths_in_place(functions: &mut [FunctionMeta], code_len_total: usize) {
@@ -53,3 +73,176 @@ pub fn function_index_by_pc(functions: &[FunctionMeta], code_len_total: usize, p
     }
     None
 }
+
+/// Alias: canonical function lookup by absolute PC.
+#[inline]
+pub fn lookup_function_by_pc(functions: &[FunctionMeta], code_len_total: usize, pc_abs: usize) -> Option<usize> {
+    function_index_by_pc(functions, code_len_total, pc_abs)
+}
+
+/// Returns true if `rel_pc` (relative to the function start) is a valid
+/// instruction boundary as determined by the canonical decoder.
+///
+/// Contract:
+/// - `rel_pc == 0` is always a boundary if `func_idx` is valid.
+/// - Boundaries are computed by stepping with `decoder::decode_next` from the
+///   function start up to (and possibly past) `rel_pc` but never beyond the
+///   function's exclusive end; an invalid `func_idx` or oversized `rel_pc` yields `false`.
+/// - Any decode error before reaching `rel_pc` yields `false` (invalid program).
+pub fn is_boundary(functions: &[FunctionMeta], code: &[u8], code_len_total: usize, func_idx: usize, rel_pc: usize) -> bool {
+    let (start, end) = match functions.get(func_idx) {
+        Some(_) => function_range(functions, func_idx, code_len_total),
+        None => return false,
+    };
+
+    let func_len = end.saturating_sub(start);
+    if rel_pc == 0 { return true; } // the function start is always a boundary
+    if rel_pc > func_len { return false; } // target lies past the exclusive end
+
+    let target = start + rel_pc;
+    let mut pc = start;
+    while pc < end {
+        match decode_next(pc, code) {
+            Ok(di) => {
+                let next = di.next_pc;
+                if next > end { return false; } // decoded instruction runs past the function end
+                if next == target { return true; }
+                if next <= pc { return false; } // must make progress
+                pc = next;
+                if pc > target { return false; } // stepped over `target`: it is mid-instruction
+            }
+            Err(_) => return false,
+        }
+    }
+    // If we reached end without matching `target`, only boundary is exact end
+    target == end
+}
+
+/// Returns true if `abs_pc` is a valid instruction boundary for the function
+/// containing it (false if decoding fails). A PC outside every function range
+/// is a boundary only when it equals some function's start or exclusive end.
+pub fn is_boundary_abs(functions: &[FunctionMeta], code: &[u8], code_len_total: usize, abs_pc: usize) -> bool {
+    if let Some(func_idx) = lookup_function_by_pc(functions, code_len_total, abs_pc) {
+        let (start, _end) = function_range(functions, func_idx, code_len_total);
+        let rel = abs_pc.saturating_sub(start); // offset of `abs_pc` within its function
+        return is_boundary(functions, code, code_len_total, func_idx, rel);
+    }
+
+    // Not inside any function range; allow exact function starts/ends as
+    // valid boundaries (e.g., last function end == total code len).
+    for i in 0..functions.len() {
+        let (start, end) = function_range(functions, i, code_len_total);
+        if abs_pc == start || abs_pc == end {
+            return true;
+        }
+    }
+    false
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::asm::{assemble, Asm, Operand};
+    use crate::opcode::OpCode;
+
+    fn build_funcs(offsets: &[usize], lens: Option<&[usize]>) -> Vec<FunctionMeta> {
+        let mut v = Vec::new();
+        for (i, off) in offsets.iter().copied().enumerate() {
+            let len_u32 = lens.and_then(|ls| ls.get(i).copied()).unwrap_or(0) as u32; // missing length => 0
+            v.push(FunctionMeta {
+                code_offset: off as u32,
+                code_len: len_u32,
+                param_slots: 0,
+                local_slots: 0,
+                return_slots: 0,
+                max_stack_slots: 0,
+            });
+        }
+        v
+    }
+
+    #[test]
+    fn boundaries_known_sequence() {
+        // Build a function with mixed immediate sizes:
+        // [NOP][PUSH_I32 4][PUSH_I64 8][PUSH_BOOL 1][HALT]
+        let code = assemble(&[
+            Asm::Op(OpCode::Nop, vec![]),
+            Asm::Op(OpCode::PushI32, vec![Operand::I32(123)]),
+            Asm::Op(OpCode::PushI64, vec![Operand::I64(42)]),
+            Asm::Op(OpCode::PushBool, vec![Operand::Bool(true)]),
+            Asm::Op(OpCode::Halt, vec![]),
+        ]).unwrap();
+
+        // Single function starting at 0
+        let code_len_total = code.len();
+        let mut funcs = build_funcs(&[0], None);
+        recompute_function_lengths_in_place(&mut funcs, code_len_total);
+
+        // Expected boundaries (relative): 0, 2, 8, 18, 21, 23
+        // Explanation per instruction size: opcode(2) + imm
+        let expected = [0usize, 2, 8, 18, 21, 23];
+        for rel in 0..=expected.last().copied().unwrap() {
+            let should_be_boundary = expected.contains(&rel);
+            assert_eq!(
+                is_boundary(&funcs, &code, code_len_total, 0, rel),
+                should_be_boundary,
+                "rel_pc={} boundary mismatch",
+                rel
+            );
+        }
+
+        // Check absolute variant too
+        for rel in expected {
+            let abs = rel; // the single function starts at 0, so absolute == relative
+            assert!(is_boundary_abs(&funcs, &code, code_len_total, abs));
+        }
+    }
+
+    #[test]
+    fn fuzz_table_monotonic_and_boundaries() {
+        // Build a pseudo-random but valid sequence using a simple pattern over opcodes
+        // to avoid invalid encodings. We cycle through a few known-good opcodes.
+        let ops = [
+            OpCode::Nop,
+            OpCode::PushI32,
+            OpCode::PushBool,
+            OpCode::PushI64,
+            OpCode::Pop,
+            OpCode::Ret,
+        ];
+
+        let mut prog = Vec::new();
+        for i in 0..50 {
+            let op = ops[i % ops.len()];
+            let asm = match op {
+                OpCode::Nop => Asm::Op(OpCode::Nop, vec![]),
+                OpCode::PushI32 => Asm::Op(OpCode::PushI32, vec![Operand::I32(i as i32)]),
+                OpCode::PushBool => Asm::Op(OpCode::PushBool, vec![Operand::Bool(i % 2 == 0)]),
+                OpCode::PushI64 => Asm::Op(OpCode::PushI64, vec![Operand::I64(i as i64)]),
+                OpCode::Pop => Asm::Op(OpCode::Pop, vec![]),
+                OpCode::Ret => Asm::Op(OpCode::Ret, vec![]),
+                _ => unreachable!(),
+            };
+            prog.push(asm);
+        }
+
+        let code = assemble(&prog).unwrap();
+        let code_len_total = code.len();
+        let mut funcs = build_funcs(&[0], None);
+        recompute_function_lengths_in_place(&mut funcs, code_len_total);
+        let (start, end) = function_range(&funcs, 0, code_len_total);
+        assert_eq!(start, 0);
+        assert_eq!(end, code_len_total);
+
+        // Walk with decoder and verify boundaries are accepted
+        let mut pc = start;
+        while pc < end {
+            assert!(is_boundary_abs(&funcs, &code, code_len_total, pc));
+            let di = decode_next(pc, &code).expect("decode_next");
+            assert!(di.next_pc > pc && di.next_pc <= end);
+            pc = di.next_pc;
+        }
+        // End must be a boundary too
+        assert!(is_boundary(&funcs, &code, code_len_total, 0, end - start));
+    }
+}
diff --git a/test-cartridges/canonical/golden/program.pbc b/test-cartridges/canonical/golden/program.pbc
index 5a70490d..3646ac85 100644
Binary files a/test-cartridges/canonical/golden/program.pbc and b/test-cartridges/canonical/golden/program.pbc differ