Nilton Constantino 66a77709f0
pr 59
2026-02-02 17:52:20 +00:00

443 lines
13 KiB
Rust

use super::token::{Token, TokenKind};
use crate::common::spans::Span;
use std::iter::Peekable;
use std::str::Chars;
/// Hand-written streaming lexer: turns source text into `Token`s, one
/// `next_token()` call at a time.
pub struct Lexer<'a> {
    /// Peekable iterator over the remaining source characters.
    chars: Peekable<Chars<'a>>,
    /// Id of the file being lexed; copied into every token's `Span`.
    file_id: usize,
    /// Current byte offset into the source; `next` advances it by each
    /// character's UTF-8 length, so spans index the raw source bytes.
    pos: u32,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer over `source`; `file_id` is copied into the `Span`
    /// of every token produced.
    pub fn new(source: &'a str, file_id: usize) -> Self {
        Self {
            chars: source.chars().peekable(),
            file_id,
            pos: 0,
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }

    /// Consumes and returns the next character, advancing `pos` by its
    /// UTF-8 byte length so spans are byte offsets into the source.
    fn next(&mut self) -> Option<char> {
        let c = self.chars.next();
        if let Some(c) = c {
            self.pos += c.len_utf8() as u32;
        }
        c
    }

    /// Skips whitespace and `//` line comments. A lone `/` (the division
    /// operator) is left in place for `next_token` to consume.
    fn skip_whitespace(&mut self) {
        while let Some(c) = self.peek() {
            if c.is_whitespace() {
                self.next();
            } else if c == '/' && self.peek_next() == Some('/') {
                // Line comment: consume "//" and everything up to (but not
                // including) the newline; the `\n` itself is then skipped
                // as whitespace on the next loop iteration.
                self.next(); // first '/'
                self.next(); // second '/'
                while let Some(c) = self.peek() {
                    if c == '\n' {
                        break;
                    }
                    self.next();
                }
            } else {
                break;
            }
        }
    }

    /// Returns the character two positions ahead without consuming
    /// anything, by cloning the underlying char iterator (cheap: it is a
    /// pointer pair over the source slice).
    fn peek_next(&self) -> Option<char> {
        let mut cloned = self.chars.clone();
        cloned.next();
        cloned.peek().copied()
    }

    /// Produces the next token. Once the input is exhausted this returns
    /// `Eof` tokens (repeatedly, with an empty span). Malformed input
    /// yields `TokenKind::Invalid` rather than panicking.
    pub fn next_token(&mut self) -> Token {
        self.skip_whitespace();
        let start = self.pos;
        let c = match self.next() {
            Some(c) => c,
            None => return Token::new(TokenKind::Eof, Span::new(self.file_id, start, start)),
        };
        let kind = match c {
            '(' => TokenKind::OpenParen,
            ')' => TokenKind::CloseParen,
            '{' => TokenKind::OpenBrace,
            '}' => TokenKind::CloseBrace,
            // `[[` / `]]` are distinct two-character tokens; a single
            // bracket is emitted when the pair is not completed.
            '[' => {
                if self.peek() == Some('[') {
                    self.next();
                    TokenKind::OpenDoubleBracket
                } else {
                    TokenKind::OpenBracket
                }
            }
            ']' => {
                if self.peek() == Some(']') {
                    self.next();
                    TokenKind::CloseDoubleBracket
                } else {
                    TokenKind::CloseBracket
                }
            }
            ',' => TokenKind::Comma,
            '.' => TokenKind::Dot,
            ':' => TokenKind::Colon,
            ';' => TokenKind::Semicolon,
            '=' => {
                if self.peek() == Some('=') {
                    self.next();
                    TokenKind::Eq
                } else {
                    TokenKind::Assign
                }
            }
            '+' => TokenKind::Plus,
            '-' => {
                if self.peek() == Some('>') {
                    self.next();
                    TokenKind::Arrow
                } else {
                    TokenKind::Minus
                }
            }
            '*' => TokenKind::Star,
            // A bare '/' here is always division: skip_whitespace has
            // already consumed any "//" comment.
            '/' => TokenKind::Slash,
            '%' => TokenKind::Percent,
            '!' => {
                if self.peek() == Some('=') {
                    self.next();
                    TokenKind::Neq
                } else {
                    TokenKind::Not
                }
            }
            '<' => {
                if self.peek() == Some('=') {
                    self.next();
                    TokenKind::Lte
                } else {
                    TokenKind::Lt
                }
            }
            '>' => {
                if self.peek() == Some('=') {
                    self.next();
                    TokenKind::Gte
                } else {
                    TokenKind::Gt
                }
            }
            // `&` and `|` only exist as the doubled logical operators.
            '&' => {
                if self.peek() == Some('&') {
                    self.next();
                    TokenKind::And
                } else {
                    TokenKind::Invalid("&".to_string())
                }
            }
            '|' => {
                if self.peek() == Some('|') {
                    self.next();
                    TokenKind::Or
                } else {
                    TokenKind::Invalid("|".to_string())
                }
            }
            '"' => self.lex_string(),
            '0'..='9' => self.lex_number(c),
            c if is_identifier_start(c) => self.lex_identifier(c),
            _ => TokenKind::Invalid(c.to_string()),
        };
        Token::new(kind, Span::new(self.file_id, start, self.pos))
    }

    /// Lexes a string literal; the opening `"` has already been consumed.
    /// Strings may not span lines: a newline or EOF before the closing
    /// `"` yields `TokenKind::Invalid`. No escape sequences are
    /// recognized (none appear in this lexer's grammar as written).
    fn lex_string(&mut self) -> TokenKind {
        let mut s = String::new();
        while let Some(c) = self.peek() {
            if c == '"' {
                self.next();
                return TokenKind::StringLit(s);
            }
            if c == '\n' {
                break; // unterminated string; leave the newline unconsumed
            }
            s.push(self.next().unwrap());
        }
        TokenKind::Invalid("Unterminated string".to_string())
    }

    /// Lexes a numeric literal; `first` is the already-consumed leading
    /// digit. Recognizes integers (`123` -> `IntLit`), floats (`3.14` ->
    /// `FloatLit`) and bounded integers (`255b` -> `BoundedLit`). A `.`
    /// not followed by a digit is left unconsumed so `1.foo` lexes as
    /// `1` `.` `foo`.
    fn lex_number(&mut self, first: char) -> TokenKind {
        let mut s = String::new();
        s.push(first);
        let mut is_float = false;
        while let Some(c) = self.peek() {
            if c.is_ascii_digit() {
                s.push(self.next().unwrap());
            } else if c == '.' && !is_float {
                // Only treat '.' as a decimal point when a digit follows;
                // otherwise it is a separate Dot token.
                match self.peek_next() {
                    Some(next_c) if next_c.is_ascii_digit() => {
                        is_float = true;
                        s.push(self.next().unwrap()); // '.'
                        s.push(self.next().unwrap()); // first fractional digit
                    }
                    _ => break,
                }
            } else {
                break;
            }
        }
        if !is_float && self.peek() == Some('b') {
            self.next(); // consume the 'b' suffix
            return match s.parse::<u32>() {
                Ok(val) => TokenKind::BoundedLit(val),
                // BUGFIX: the suffix is already consumed, so an
                // out-of-range bounded literal (e.g. `99999999999b`) must
                // not fall through and be silently re-lexed as IntLit
                // with the `b` dropped.
                Err(_) => TokenKind::Invalid(format!("{}b", s)),
            };
        }
        if is_float {
            if let Ok(val) = s.parse::<f64>() {
                return TokenKind::FloatLit(val);
            }
        } else if let Ok(val) = s.parse::<i64>() {
            // Overflowing i64 (or f64 parse failure above) falls through
            // to Invalid carrying the raw digits.
            return TokenKind::IntLit(val);
        }
        TokenKind::Invalid(s)
    }

    /// Lexes an identifier or keyword; `first` is the already-consumed
    /// leading character. Keywords are resolved by exact match, anything
    /// else becomes `Identifier`.
    fn lex_identifier(&mut self, first: char) -> TokenKind {
        let mut s = String::new();
        s.push(first);
        while let Some(c) = self.peek() {
            if is_identifier_part(c) {
                s.push(self.next().unwrap());
            } else {
                break;
            }
        }
        match s.as_str() {
            "import" => TokenKind::Import,
            "pub" => TokenKind::Pub,
            "mod" => TokenKind::Mod,
            "service" => TokenKind::Service,
            "fn" => TokenKind::Fn,
            "let" => TokenKind::Let,
            "mut" => TokenKind::Mut,
            "declare" => TokenKind::Declare,
            "struct" => TokenKind::Struct,
            "contract" => TokenKind::Contract,
            "host" => TokenKind::Host,
            "error" => TokenKind::Error,
            "optional" => TokenKind::Optional,
            "result" => TokenKind::Result,
            "some" => TokenKind::Some,
            "none" => TokenKind::None,
            "ok" => TokenKind::Ok,
            "err" => TokenKind::Err,
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "when" => TokenKind::When,
            "for" => TokenKind::For,
            "in" => TokenKind::In,
            "return" => TokenKind::Return,
            "handle" => TokenKind::Handle,
            "borrow" => TokenKind::Borrow,
            "mutate" => TokenKind::Mutate,
            "peek" => TokenKind::Peek,
            "take" => TokenKind::Take,
            "alloc" => TokenKind::Alloc,
            "weak" => TokenKind::Weak,
            "as" => TokenKind::As,
            "bounded" => TokenKind::Bounded,
            _ => TokenKind::Identifier(s),
        }
    }
}
/// Returns true when `c` may begin an identifier: an underscore or any
/// Unicode alphabetic character.
fn is_identifier_start(c: char) -> bool {
    c == '_' || c.is_alphabetic()
}
/// Returns true when `c` may continue an identifier: an underscore or any
/// Unicode alphanumeric character.
fn is_identifier_part(c: char) -> bool {
    c == '_' || c.is_alphanumeric()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::frontends::pbs::token::TokenKind;

    /// Lexes `source` and asserts the produced token kinds match
    /// `expected`, in order.
    fn assert_token_kinds(source: &str, expected: &[TokenKind]) {
        let mut lexer = Lexer::new(source, 0);
        for kind in expected {
            assert_eq!(&lexer.next_token().kind, kind);
        }
    }

    #[test]
    fn test_lex_basic_tokens() {
        // Coverage includes the two-character bracket pairs `[[` / `]]`,
        // which the original list omitted.
        let source = "( ) { } [ ] [[ ]] , . : ; -> = == + - * / % ! != < > <= >= && ||";
        assert_token_kinds(
            source,
            &[
                TokenKind::OpenParen, TokenKind::CloseParen,
                TokenKind::OpenBrace, TokenKind::CloseBrace,
                TokenKind::OpenBracket, TokenKind::CloseBracket,
                TokenKind::OpenDoubleBracket, TokenKind::CloseDoubleBracket,
                TokenKind::Comma, TokenKind::Dot, TokenKind::Colon, TokenKind::Semicolon,
                TokenKind::Arrow, TokenKind::Assign, TokenKind::Eq,
                TokenKind::Plus, TokenKind::Minus, TokenKind::Star, TokenKind::Slash, TokenKind::Percent,
                TokenKind::Not, TokenKind::Neq,
                TokenKind::Lt, TokenKind::Gt, TokenKind::Lte, TokenKind::Gte,
                TokenKind::And, TokenKind::Or,
                TokenKind::Eof,
            ],
        );
    }

    #[test]
    fn test_lex_keywords() {
        // Covers every keyword handled by `lex_identifier`, including
        // `bounded`, which the original list omitted.
        let source = "import pub mod service fn let mut declare struct contract host error optional result some none ok err if else when for in return handle borrow mutate peek take alloc weak as bounded";
        assert_token_kinds(
            source,
            &[
                TokenKind::Import, TokenKind::Pub, TokenKind::Mod, TokenKind::Service,
                TokenKind::Fn, TokenKind::Let, TokenKind::Mut, TokenKind::Declare,
                TokenKind::Struct, TokenKind::Contract, TokenKind::Host, TokenKind::Error,
                TokenKind::Optional, TokenKind::Result, TokenKind::Some, TokenKind::None,
                TokenKind::Ok, TokenKind::Err, TokenKind::If, TokenKind::Else,
                TokenKind::When, TokenKind::For, TokenKind::In, TokenKind::Return,
                TokenKind::Handle, TokenKind::Borrow, TokenKind::Mutate, TokenKind::Peek,
                TokenKind::Take, TokenKind::Alloc, TokenKind::Weak, TokenKind::As,
                TokenKind::Bounded,
                TokenKind::Eof,
            ],
        );
    }

    #[test]
    fn test_lex_identifiers() {
        assert_token_kinds(
            "foo bar _baz qux123",
            &[
                TokenKind::Identifier("foo".to_string()),
                TokenKind::Identifier("bar".to_string()),
                TokenKind::Identifier("_baz".to_string()),
                TokenKind::Identifier("qux123".to_string()),
                TokenKind::Eof,
            ],
        );
    }

    #[test]
    fn test_lex_literals() {
        assert_token_kinds(
            "123 3.14 255b \"hello world\"",
            &[
                TokenKind::IntLit(123),
                TokenKind::FloatLit(3.14),
                TokenKind::BoundedLit(255),
                TokenKind::StringLit("hello world".to_string()),
                TokenKind::Eof,
            ],
        );
    }

    #[test]
    fn test_lex_comments() {
        // Everything after `//` up to the newline is discarded.
        assert_token_kinds(
            "let x = 10; // this is a comment\nlet y = 20;",
            &[
                TokenKind::Let,
                TokenKind::Identifier("x".to_string()),
                TokenKind::Assign,
                TokenKind::IntLit(10),
                TokenKind::Semicolon,
                TokenKind::Let,
                TokenKind::Identifier("y".to_string()),
                TokenKind::Assign,
                TokenKind::IntLit(20),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ],
        );
    }

    #[test]
    fn test_lex_spans() {
        // Spans are byte offsets into the source: [start, end).
        let mut lexer = Lexer::new("let x = 10;", 0);
        //              "let x = 10;"
        let expected = [(0, 3), (4, 5), (6, 7), (8, 10), (10, 11)];
        for (start, end) in expected {
            let token = lexer.next_token();
            assert_eq!(token.span.start, start);
            assert_eq!(token.span.end, end);
        }
    }

    #[test]
    fn test_lex_invalid_tokens() {
        let mut lexer = Lexer::new("@ #", 0);
        assert!(matches!(lexer.next_token().kind, TokenKind::Invalid(_)));
        assert!(matches!(lexer.next_token().kind, TokenKind::Invalid(_)));
        assert_eq!(lexer.next_token().kind, TokenKind::Eof);
    }

    #[test]
    fn test_lex_unterminated_string() {
        let mut lexer = Lexer::new("\"hello", 0);
        assert!(matches!(lexer.next_token().kind, TokenKind::Invalid(_)));
    }
}