Nilton Constantino 239d7251c3
pr 51
2026-01-31 23:49:24 +00:00

713 lines
25 KiB
Rust

pub mod linker;
use crate::opcode::OpCode;
use crate::abi::SourceSpan;
/// One value stored in a module's constant pool.
///
/// The constant pool is a table of unique values used by the program. Rather
/// than embedding bulky data (such as strings) in the instruction stream, the
/// bytecode loads a pool value onto the stack with `PushConst <index>`.
#[derive(Debug, Clone, PartialEq)]
pub enum ConstantPoolEntry {
    /// Null/undefined value; occupies the reserved index 0.
    Null,
    /// Signed 64-bit integer constant.
    Int64(i64),
    /// 64-bit floating point constant.
    Float64(f64),
    /// Boolean constant.
    Boolean(bool),
    /// UTF-8 string constant.
    String(String),
    /// Signed 32-bit integer constant.
    Int32(i32),
}
/// Reasons a serialized module can be rejected while loading.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LoadError {
    /// The input does not start with the `PBS\0` magic bytes.
    InvalidMagic,
    /// The header carries a format version the loader does not accept.
    InvalidVersion,
    /// The header's endianness marker is not the little-endian value (0).
    InvalidEndianness,
    /// Two section-table entries describe overlapping byte ranges.
    OverlappingSections,
    /// A section-table entry reaches past the end of the input.
    SectionOutOfBounds,
    /// A word in the code stream does not decode to a known opcode.
    InvalidOpcode,
    /// A `PushConst` operand points outside the constant pool.
    InvalidConstIndex,
    /// A function entry is invalid; module validation reports this when a
    /// function's code range does not fit inside the code section.
    InvalidFunctionIndex,
    /// The header is structurally invalid.
    MalformedHeader,
    /// A section payload violates its format (for example, it is too short
    /// to hold its entry count, or contains an unknown constant tag).
    MalformedSection,
    /// The input ended before a complete header, table entry, or payload
    /// could be read.
    UnexpectedEof,
}
/// Per-function record of the module's function table.
///
/// Serialized as 16 bytes: two little-endian `u32` fields followed by four
/// little-endian `u16` fields, in declaration order.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct FunctionMeta {
    /// Byte offset of the function's code inside the module's code section.
    pub code_offset: u32,
    /// Length of the function's code in bytes.
    pub code_len: u32,
    /// Slot count for parameters (slot semantics are defined by the VM).
    pub param_slots: u16,
    /// Slot count for locals.
    pub local_slots: u16,
    /// Slot count for return values.
    pub return_slots: u16,
    /// Maximum stack slots the function is declared to need.
    pub max_stack_slots: u16,
}
/// Optional debugging tables attached to a module.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DebugInfo {
    /// `(pc, span)` pairs mapping code positions to source spans, sorted by PC.
    pub pc_to_span: Vec<(u32, SourceSpan)>,
    /// `(func_idx, name)` pairs naming functions.
    pub function_names: Vec<(u32, String)>,
}
/// A function made visible to other modules under a symbol name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Export {
    /// Name under which the function is exported.
    pub symbol: String,
    /// Index of the exported function.
    pub func_idx: u32,
}
/// A symbol this module expects to be provided from elsewhere.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Import {
    /// Name of the required symbol.
    pub symbol: String,
    /// Code positions (PCs) associated with this import; presumably the
    /// sites patched once the symbol is resolved - confirm against the linker.
    pub relocation_pcs: Vec<u32>,
}
/// In-memory form of a bytecode module.
///
/// Each field other than `version` corresponds to one section kind of the
/// serialized container.
#[derive(Debug, Clone, PartialEq)]
pub struct BytecodeModule {
    /// Format version read from the container header.
    pub version: u16,
    /// Constant pool referenced by `PushConst` operands.
    pub const_pool: Vec<ConstantPoolEntry>,
    /// Function table.
    pub functions: Vec<FunctionMeta>,
    /// Raw instruction bytes shared by all functions.
    pub code: Vec<u8>,
    /// Debugging tables, when present.
    pub debug_info: Option<DebugInfo>,
    /// Symbols exported by this module.
    pub exports: Vec<Export>,
    /// Symbols imported by this module.
    pub imports: Vec<Import>,
}
impl BytecodeModule {
pub fn serialize(&self) -> Vec<u8> {
let cp_data = self.serialize_const_pool();
let func_data = self.serialize_functions();
let code_data = self.code.clone();
let debug_data = self.debug_info.as_ref().map(|di| self.serialize_debug(di)).unwrap_or_default();
let export_data = self.serialize_exports();
let import_data = self.serialize_imports();
let mut final_sections = Vec::new();
if !cp_data.is_empty() { final_sections.push((0, cp_data)); }
if !func_data.is_empty() { final_sections.push((1, func_data)); }
if !code_data.is_empty() { final_sections.push((2, code_data)); }
if !debug_data.is_empty() { final_sections.push((3, debug_data)); }
if !export_data.is_empty() { final_sections.push((4, export_data)); }
if !import_data.is_empty() { final_sections.push((5, import_data)); }
let mut out = Vec::new();
// Magic "PBS\0"
out.extend_from_slice(b"PBS\0");
// Version 0
out.extend_from_slice(&0u16.to_le_bytes());
// Endianness 0 (Little Endian), Reserved
out.extend_from_slice(&[0u8, 0u8]);
// section_count
out.extend_from_slice(&(final_sections.len() as u32).to_le_bytes());
// padding to 32 bytes
out.extend_from_slice(&[0u8; 20]);
let mut current_offset = 32 + (final_sections.len() as u32 * 12);
// Write section table
for (kind, data) in &final_sections {
let k: u32 = *kind;
out.extend_from_slice(&k.to_le_bytes());
out.extend_from_slice(&current_offset.to_le_bytes());
out.extend_from_slice(&(data.len() as u32).to_le_bytes());
current_offset += data.len() as u32;
}
// Write section data
for (_, data) in final_sections {
out.extend_from_slice(&data);
}
out
}
fn serialize_const_pool(&self) -> Vec<u8> {
if self.const_pool.is_empty() { return Vec::new(); }
let mut data = Vec::new();
data.extend_from_slice(&(self.const_pool.len() as u32).to_le_bytes());
for entry in &self.const_pool {
match entry {
ConstantPoolEntry::Null => data.push(0),
ConstantPoolEntry::Int64(v) => {
data.push(1);
data.extend_from_slice(&v.to_le_bytes());
}
ConstantPoolEntry::Float64(v) => {
data.push(2);
data.extend_from_slice(&v.to_le_bytes());
}
ConstantPoolEntry::Boolean(v) => {
data.push(3);
data.push(if *v { 1 } else { 0 });
}
ConstantPoolEntry::String(v) => {
data.push(4);
let s_bytes = v.as_bytes();
data.extend_from_slice(&(s_bytes.len() as u32).to_le_bytes());
data.extend_from_slice(s_bytes);
}
ConstantPoolEntry::Int32(v) => {
data.push(5);
data.extend_from_slice(&v.to_le_bytes());
}
}
}
data
}
fn serialize_functions(&self) -> Vec<u8> {
if self.functions.is_empty() { return Vec::new(); }
let mut data = Vec::new();
data.extend_from_slice(&(self.functions.len() as u32).to_le_bytes());
for f in &self.functions {
data.extend_from_slice(&f.code_offset.to_le_bytes());
data.extend_from_slice(&f.code_len.to_le_bytes());
data.extend_from_slice(&f.param_slots.to_le_bytes());
data.extend_from_slice(&f.local_slots.to_le_bytes());
data.extend_from_slice(&f.return_slots.to_le_bytes());
data.extend_from_slice(&f.max_stack_slots.to_le_bytes());
}
data
}
fn serialize_debug(&self, di: &DebugInfo) -> Vec<u8> {
let mut data = Vec::new();
data.extend_from_slice(&(di.pc_to_span.len() as u32).to_le_bytes());
for (pc, span) in &di.pc_to_span {
data.extend_from_slice(&pc.to_le_bytes());
data.extend_from_slice(&span.file_id.to_le_bytes());
data.extend_from_slice(&span.start.to_le_bytes());
data.extend_from_slice(&span.end.to_le_bytes());
}
data.extend_from_slice(&(di.function_names.len() as u32).to_le_bytes());
for (idx, name) in &di.function_names {
data.extend_from_slice(&idx.to_le_bytes());
let n_bytes = name.as_bytes();
data.extend_from_slice(&(n_bytes.len() as u32).to_le_bytes());
data.extend_from_slice(n_bytes);
}
data
}
fn serialize_exports(&self) -> Vec<u8> {
if self.exports.is_empty() { return Vec::new(); }
let mut data = Vec::new();
data.extend_from_slice(&(self.exports.len() as u32).to_le_bytes());
for exp in &self.exports {
data.extend_from_slice(&exp.func_idx.to_le_bytes());
let s_bytes = exp.symbol.as_bytes();
data.extend_from_slice(&(s_bytes.len() as u32).to_le_bytes());
data.extend_from_slice(s_bytes);
}
data
}
fn serialize_imports(&self) -> Vec<u8> {
if self.imports.is_empty() { return Vec::new(); }
let mut data = Vec::new();
data.extend_from_slice(&(self.imports.len() as u32).to_le_bytes());
for imp in &self.imports {
let s_bytes = imp.symbol.as_bytes();
data.extend_from_slice(&(s_bytes.len() as u32).to_le_bytes());
data.extend_from_slice(s_bytes);
data.extend_from_slice(&(imp.relocation_pcs.len() as u32).to_le_bytes());
for pc in &imp.relocation_pcs {
data.extend_from_slice(&pc.to_le_bytes());
}
}
data
}
}
/// Stateless entry point for decoding serialized PBS modules.
pub struct BytecodeLoader;

impl BytecodeLoader {
    /// Decodes and validates a serialized module.
    ///
    /// The input must start with a 32-byte header (magic `PBS\0`, `u16`
    /// version, endianness byte, reserved byte, `u32` section count), followed
    /// by a table of `(kind, offset, length)` `u32` triples. Offsets are
    /// absolute positions within `bytes`. Sections of unknown kind are skipped.
    ///
    /// # Errors
    /// * [`LoadError::UnexpectedEof`] - input shorter than the header or the
    ///   announced section table.
    /// * [`LoadError::InvalidMagic`], [`LoadError::InvalidVersion`],
    ///   [`LoadError::InvalidEndianness`] - header field mismatch.
    /// * [`LoadError::SectionOutOfBounds`] - a section reaches past the input.
    /// * [`LoadError::OverlappingSections`] - two sections share bytes.
    /// * Any error produced by the per-section parsers or module validation.
    pub fn load(bytes: &[u8]) -> Result<BytecodeModule, LoadError> {
        if bytes.len() < 32 {
            return Err(LoadError::UnexpectedEof);
        }
        // Magic "PBS\0"
        if &bytes[0..4] != b"PBS\0" {
            return Err(LoadError::InvalidMagic);
        }
        let version = u16::from_le_bytes([bytes[4], bytes[5]]);
        if version != 0 {
            return Err(LoadError::InvalidVersion);
        }
        // 0 = little-endian, the only supported byte order.
        if bytes[6] != 0 {
            return Err(LoadError::InvalidEndianness);
        }
        let section_count = u32::from_le_bytes([bytes[8], bytes[9], bytes[10], bytes[11]]);

        // Each entry is stored as (kind, start, end) with a validated,
        // overflow-checked end so later slicing cannot panic.
        let mut sections: Vec<(u32, usize, usize)> = Vec::new();
        let mut pos = 32usize;
        for _ in 0..section_count {
            let entry = bytes.get(pos..pos + 12).ok_or(LoadError::UnexpectedEof)?;
            let kind = u32::from_le_bytes([entry[0], entry[1], entry[2], entry[3]]);
            let start = u32::from_le_bytes([entry[4], entry[5], entry[6], entry[7]]) as usize;
            let length = u32::from_le_bytes([entry[8], entry[9], entry[10], entry[11]]) as usize;
            // checked_add: on 32-bit targets offset + length can exceed usize,
            // and a wrapped sum would slip past the bounds comparison.
            let end = start
                .checked_add(length)
                .filter(|&end| end <= bytes.len())
                .ok_or(LoadError::SectionOutOfBounds)?;
            sections.push((kind, start, end));
            pos += 12;
        }

        // Reject any pair of sections whose byte ranges intersect.
        for (i, &(_, start_a, end_a)) in sections.iter().enumerate() {
            for &(_, start_b, end_b) in &sections[i + 1..] {
                if start_a < end_b && start_b < end_a {
                    return Err(LoadError::OverlappingSections);
                }
            }
        }

        let mut module = BytecodeModule {
            version,
            const_pool: Vec::new(),
            functions: Vec::new(),
            code: Vec::new(),
            debug_info: None,
            exports: Vec::new(),
            imports: Vec::new(),
        };
        for (kind, start, end) in sections {
            let section_data = &bytes[start..end];
            match kind {
                // Const Pool
                0 => module.const_pool = parse_const_pool(section_data)?,
                // Functions
                1 => module.functions = parse_functions(section_data)?,
                // Code
                2 => module.code = section_data.to_vec(),
                // Debug Info
                3 => module.debug_info = Some(parse_debug_section(section_data)?),
                // Exports
                4 => module.exports = parse_exports(section_data)?,
                // Imports
                5 => module.imports = parse_imports(section_data)?,
                // Skip unknown or optional sections
                _ => {}
            }
        }

        // Cross-section validation (function ranges, opcode stream).
        validate_module(&module)?;
        Ok(module)
    }
}
fn parse_const_pool(data: &[u8]) -> Result<Vec<ConstantPoolEntry>, LoadError> {
if data.is_empty() {
return Ok(Vec::new());
}
if data.len() < 4 {
return Err(LoadError::MalformedSection);
}
let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
let mut cp = Vec::with_capacity(count);
let mut pos = 4;
for _ in 0..count {
if pos >= data.len() {
return Err(LoadError::UnexpectedEof);
}
let tag = data[pos];
pos += 1;
match tag {
0 => cp.push(ConstantPoolEntry::Null),
1 => { // Int64
if pos + 8 > data.len() { return Err(LoadError::UnexpectedEof); }
let val = i64::from_le_bytes(data[pos..pos+8].try_into().unwrap());
cp.push(ConstantPoolEntry::Int64(val));
pos += 8;
}
2 => { // Float64
if pos + 8 > data.len() { return Err(LoadError::UnexpectedEof); }
let val = f64::from_le_bytes(data[pos..pos+8].try_into().unwrap());
cp.push(ConstantPoolEntry::Float64(val));
pos += 8;
}
3 => { // Boolean
if pos >= data.len() { return Err(LoadError::UnexpectedEof); }
cp.push(ConstantPoolEntry::Boolean(data[pos] != 0));
pos += 1;
}
4 => { // String
if pos + 4 > data.len() { return Err(LoadError::UnexpectedEof); }
let len = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap()) as usize;
pos += 4;
if pos + len > data.len() { return Err(LoadError::UnexpectedEof); }
let s = String::from_utf8_lossy(&data[pos..pos+len]).into_owned();
cp.push(ConstantPoolEntry::String(s));
pos += len;
}
5 => { // Int32
if pos + 4 > data.len() { return Err(LoadError::UnexpectedEof); }
let val = i32::from_le_bytes(data[pos..pos+4].try_into().unwrap());
cp.push(ConstantPoolEntry::Int32(val));
pos += 4;
}
_ => return Err(LoadError::MalformedSection),
}
}
Ok(cp)
}
fn parse_functions(data: &[u8]) -> Result<Vec<FunctionMeta>, LoadError> {
if data.is_empty() {
return Ok(Vec::new());
}
if data.len() < 4 {
return Err(LoadError::MalformedSection);
}
let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
let mut functions = Vec::with_capacity(count);
let mut pos = 4;
for _ in 0..count {
if pos + 16 > data.len() {
return Err(LoadError::UnexpectedEof);
}
let code_offset = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap());
let code_len = u32::from_le_bytes(data[pos+4..pos+8].try_into().unwrap());
let param_slots = u16::from_le_bytes(data[pos+8..pos+10].try_into().unwrap());
let local_slots = u16::from_le_bytes(data[pos+10..pos+12].try_into().unwrap());
let return_slots = u16::from_le_bytes(data[pos+12..pos+14].try_into().unwrap());
let max_stack_slots = u16::from_le_bytes(data[pos+14..pos+16].try_into().unwrap());
functions.push(FunctionMeta {
code_offset,
code_len,
param_slots,
local_slots,
return_slots,
max_stack_slots,
});
pos += 16;
}
Ok(functions)
}
fn parse_debug_section(data: &[u8]) -> Result<DebugInfo, LoadError> {
if data.is_empty() {
return Ok(DebugInfo::default());
}
if data.len() < 8 {
return Err(LoadError::MalformedSection);
}
let mut pos = 0;
// PC to Span table
let span_count = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap()) as usize;
pos += 4;
let mut pc_to_span = Vec::with_capacity(span_count);
for _ in 0..span_count {
if pos + 16 > data.len() {
return Err(LoadError::UnexpectedEof);
}
let pc = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap());
let file_id = u32::from_le_bytes(data[pos+4..pos+8].try_into().unwrap());
let start = u32::from_le_bytes(data[pos+8..pos+12].try_into().unwrap());
let end = u32::from_le_bytes(data[pos+12..pos+16].try_into().unwrap());
pc_to_span.push((pc, SourceSpan { file_id, start, end }));
pos += 16;
}
// Function names table
if pos + 4 > data.len() {
return Err(LoadError::UnexpectedEof);
}
let func_name_count = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap()) as usize;
pos += 4;
let mut function_names = Vec::with_capacity(func_name_count);
for _ in 0..func_name_count {
if pos + 8 > data.len() {
return Err(LoadError::UnexpectedEof);
}
let func_idx = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap());
let name_len = u32::from_le_bytes(data[pos+4..pos+8].try_into().unwrap()) as usize;
pos += 8;
if pos + name_len > data.len() {
return Err(LoadError::UnexpectedEof);
}
let name = String::from_utf8_lossy(&data[pos..pos+name_len]).into_owned();
function_names.push((func_idx, name));
pos += name_len;
}
Ok(DebugInfo { pc_to_span, function_names })
}
fn parse_exports(data: &[u8]) -> Result<Vec<Export>, LoadError> {
if data.is_empty() {
return Ok(Vec::new());
}
if data.len() < 4 {
return Err(LoadError::MalformedSection);
}
let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
let mut exports = Vec::with_capacity(count);
let mut pos = 4;
for _ in 0..count {
if pos + 8 > data.len() {
return Err(LoadError::UnexpectedEof);
}
let func_idx = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap());
let name_len = u32::from_le_bytes(data[pos+4..pos+8].try_into().unwrap()) as usize;
pos += 8;
if pos + name_len > data.len() {
return Err(LoadError::UnexpectedEof);
}
let symbol = String::from_utf8_lossy(&data[pos..pos+name_len]).into_owned();
exports.push(Export { symbol, func_idx });
pos += name_len;
}
Ok(exports)
}
fn parse_imports(data: &[u8]) -> Result<Vec<Import>, LoadError> {
if data.is_empty() {
return Ok(Vec::new());
}
if data.len() < 4 {
return Err(LoadError::MalformedSection);
}
let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
let mut imports = Vec::with_capacity(count);
let mut pos = 4;
for _ in 0..count {
if pos + 8 > data.len() {
return Err(LoadError::UnexpectedEof);
}
let relocation_count = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap()) as usize;
let name_len = u32::from_le_bytes(data[pos+4..pos+8].try_into().unwrap()) as usize;
pos += 8;
if pos + name_len > data.len() {
return Err(LoadError::UnexpectedEof);
}
let symbol = String::from_utf8_lossy(&data[pos..pos+name_len]).into_owned();
pos += name_len;
if pos + relocation_count * 4 > data.len() {
return Err(LoadError::UnexpectedEof);
}
let mut relocation_pcs = Vec::with_capacity(relocation_count);
for _ in 0..relocation_count {
let pc = u32::from_le_bytes(data[pos..pos+4].try_into().unwrap());
relocation_pcs.push(pc);
pos += 4;
}
imports.push(Import { symbol, relocation_pcs });
}
Ok(imports)
}
fn validate_module(module: &BytecodeModule) -> Result<(), LoadError> {
for func in &module.functions {
// Opcode stream bounds
if (func.code_offset as usize) + (func.code_len as usize) > module.code.len() {
return Err(LoadError::InvalidFunctionIndex);
}
}
// Basic opcode scan for const pool indices
let mut pos = 0;
while pos < module.code.len() {
if pos + 2 > module.code.len() {
break; // Unexpected EOF in middle of opcode, maybe should be error
}
let op_val = u16::from_le_bytes([module.code[pos], module.code[pos+1]]);
let opcode = OpCode::try_from(op_val).map_err(|_| LoadError::InvalidOpcode)?;
pos += 2;
match opcode {
OpCode::PushConst => {
if pos + 4 > module.code.len() { return Err(LoadError::UnexpectedEof); }
let idx = u32::from_le_bytes(module.code[pos..pos+4].try_into().unwrap()) as usize;
if idx >= module.const_pool.len() {
return Err(LoadError::InvalidConstIndex);
}
pos += 4;
}
OpCode::PushI32 | OpCode::PushBounded | OpCode::Jmp | OpCode::JmpIfFalse | OpCode::JmpIfTrue
| OpCode::GetGlobal | OpCode::SetGlobal | OpCode::GetLocal | OpCode::SetLocal
| OpCode::PopN | OpCode::Syscall | OpCode::GateLoad | OpCode::GateStore => {
pos += 4;
}
OpCode::PushI64 | OpCode::PushF64 => {
pos += 8;
}
OpCode::PushBool => {
pos += 1;
}
OpCode::Call => {
pos += 4;
}
OpCode::Alloc => {
pos += 8;
}
_ => {}
}
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a zero-padded 32-byte header: magic, version 0, little-endian
    /// marker, and the given section count at bytes 8..12.
    fn create_header(section_count: u32) -> Vec<u8> {
        let mut header = Vec::with_capacity(32);
        header.extend_from_slice(b"PBS\0");
        header.extend_from_slice(&0u16.to_le_bytes()); // version
        header.push(0); // endianness
        header.push(0); // reserved
        header.extend_from_slice(&section_count.to_le_bytes());
        header.resize(32, 0);
        header
    }

    /// Appends one `(kind, offset, length)` section-table entry.
    fn push_section_entry(image: &mut Vec<u8>, kind: u32, offset: u32, length: u32) {
        for field in &[kind, offset, length] {
            image.extend_from_slice(&field.to_le_bytes());
        }
    }

    #[test]
    fn test_invalid_magic() {
        let mut image = create_header(0);
        image[0] = b'X';
        assert_eq!(BytecodeLoader::load(&image), Err(LoadError::InvalidMagic));
    }

    #[test]
    fn test_invalid_version() {
        let mut image = create_header(0);
        image[4] = 1;
        assert_eq!(BytecodeLoader::load(&image), Err(LoadError::InvalidVersion));
    }

    #[test]
    fn test_invalid_endianness() {
        let mut image = create_header(0);
        image[6] = 1;
        assert_eq!(BytecodeLoader::load(&image), Err(LoadError::InvalidEndianness));
    }

    #[test]
    fn test_overlapping_sections() {
        let mut image = create_header(2);
        // Kind 0 covers 64..96; kind 1 covers 80..112, so they share 80..96.
        push_section_entry(&mut image, 0, 64, 32);
        push_section_entry(&mut image, 1, 80, 32);
        // Make the input long enough for both ranges.
        image.resize(256, 0);
        assert_eq!(BytecodeLoader::load(&image), Err(LoadError::OverlappingSections));
    }

    #[test]
    fn test_section_out_of_bounds() {
        let mut image = create_header(1);
        // Kind 0 claims 64..1064 in a 256-byte input.
        push_section_entry(&mut image, 0, 64, 1000);
        image.resize(256, 0);
        assert_eq!(BytecodeLoader::load(&image), Err(LoadError::SectionOutOfBounds));
    }

    #[test]
    fn test_invalid_function_code_offset() {
        let mut image = create_header(2);
        // Functions section: 4-byte count + one 16-byte entry.
        push_section_entry(&mut image, 1, 64, 20);
        // Code section: 10 bytes.
        push_section_entry(&mut image, 2, 128, 10);
        image.resize(256, 0);

        // One function whose range 5..15 exceeds the 10-byte code section.
        let table_start = 64;
        let entry_start = table_start + 4;
        image[table_start..entry_start].copy_from_slice(&1u32.to_le_bytes());
        image[entry_start..entry_start + 4].copy_from_slice(&5u32.to_le_bytes()); // code_offset
        image[entry_start + 4..entry_start + 8].copy_from_slice(&10u32.to_le_bytes()); // code_len

        assert_eq!(BytecodeLoader::load(&image), Err(LoadError::InvalidFunctionIndex));
    }

    #[test]
    fn test_invalid_const_index() {
        let mut image = create_header(2);
        // Constant pool section holding only a zero count.
        push_section_entry(&mut image, 0, 64, 4);
        // Code section holding a single `PushConst 0`.
        push_section_entry(&mut image, 2, 128, 6);
        image.resize(256, 0);

        image[64..68].copy_from_slice(&0u32.to_le_bytes());
        image[128..130].copy_from_slice(&(OpCode::PushConst as u16).to_le_bytes());
        image[130..134].copy_from_slice(&0u32.to_le_bytes());

        assert_eq!(BytecodeLoader::load(&image), Err(LoadError::InvalidConstIndex));
    }

    #[test]
    fn test_valid_minimal_load() {
        let image = create_header(0);
        let module = BytecodeLoader::load(&image).unwrap();
        assert_eq!(module.version, 0);
        assert!(module.const_pool.is_empty());
        assert!(module.functions.is_empty());
        assert!(module.code.is_empty());
    }
}