diff --git a/docs/pbs/pull-requests/PR-012-pbs-byte-offset-spans.md b/docs/pbs/pull-requests/PR-012-pbs-byte-offset-spans.md deleted file mode 100644 index 071071bf..00000000 --- a/docs/pbs/pull-requests/PR-012-pbs-byte-offset-spans.md +++ /dev/null @@ -1,45 +0,0 @@ -# PR-012 - PBS Lexer Byte-Offset Spans - -## Briefing - -Lexer spans are currently tracked as Java `String` character indices. The PBS syntax spec requires stable byte offsets. This PR aligns token/span attribution with byte offsets and keeps diagnostics deterministic. - -## Motivation - -Without byte offsets, diagnostics and downstream attribution diverge on non-ASCII sources, violating the lexical contract. - -## Target - -- `prometeu-frontend-pbs` lexer and span attribution behavior. -- Diagnostics and AST attribution consumers that depend on lexer spans. - -## Scope - -- Convert lexer position accounting to UTF-8 byte offsets. -- Preserve existing tokenization semantics. -- Keep parser/semantics APIs unchanged. - -## Method - -- Introduce byte-accurate cursor accounting in lexer scanning. -- Emit token start/end using byte offsets. -- Validate compatibility with parser and diagnostics sinks. -- Add regression fixtures with non-ASCII source content. - -## Acceptance Criteria - -- All emitted tokens include UTF-8 byte offsets. -- Diagnostics from lexer/parser over non-ASCII sources point to correct byte spans. -- Existing ASCII tests remain green. -- New non-ASCII span tests are added and deterministic. - -## Tests - -- Extend lexer tests with UTF-8 multibyte identifiers/strings. -- Add parser span-attribution tests over multibyte source. -- Run full `prometeu-frontend-pbs` test suite. - -## Non-Goals - -- Changing token classes or grammar. -- Changing message wording policy. diff --git a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/main/java/p/studio/compiler/pbs/lexer/PbsLexer.java b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/main/java/p/studio/compiler/pbs/lexer/PbsLexer.java index dc3d5171..a9be86f7 100644 --- a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/main/java/p/studio/compiler/pbs/lexer/PbsLexer.java +++ b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/main/java/p/studio/compiler/pbs/lexer/PbsLexer.java @@ -27,6 +27,8 @@ public final class PbsLexer { private int start; private int current; + private int startByte; + private int currentByte; private LexerState state = LexerState.DEFAULT; private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) { @@ -53,7 +55,7 @@ public final class PbsLexer { case LINE_COMMENT -> scanLineCommentState(); } } - tokens.add(new PbsToken(PbsTokenKind.EOF, "", current, current)); + tokens.add(new PbsToken(PbsTokenKind.EOF, "", currentByte, currentByte)); return ReadOnlyList.wrap(tokens); } @@ -62,7 +64,8 @@ public final class PbsLexer { return; } start = current; - final char c = advance(); + startByte = currentByte; + final int c = advance(); switch (c) { case ' ', '\r', '\t', '\n' -> { // Deliberately ignored. @@ -130,7 +133,7 @@ public final class PbsLexer { state = LexerState.IDENTIFIER; return; } - report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(c)); + report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(Character.toString(c))); } } } @@ -174,16 +177,16 @@ public final class PbsLexer { private void scanStringState() { while (!isAtEnd() && peek() != '"') { if (peek() == '\\') { - final var escapeStart = current; + final var escapeStart = currentByte; advance(); if (!isAtEnd()) { final var escaped = advance(); if (!isValidStringEscape(escaped)) { report( LexErrors.E_LEX_INVALID_STRING_ESCAPE, - "Invalid string escape '\\%s'".formatted(escaped), + "Invalid string escape '\\%s'".formatted(Character.toString(escaped)), escapeStart, - current); + currentByte); } } continue; @@ -213,47 +216,57 @@ public final class PbsLexer { private void addToken(final PbsTokenKind kind) { final String lexeme = source.substring(start, current); - tokens.add(new PbsToken(kind, lexeme, start, current)); + tokens.add(new PbsToken(kind, lexeme, startByte, currentByte)); } private boolean match(final char expected) { if (isAtEnd()) return false; - if (source.charAt(current) != expected) return false; + final var codePoint = source.codePointAt(current); + if (Character.charCount(codePoint) != 1 || codePoint != expected) { + return false; + } current++; + currentByte++; return true; } - private char advance() { - return source.charAt(current++); + private int advance() { + final int codePoint = source.codePointAt(current); + current += Character.charCount(codePoint); + currentByte += utf8Bytes(codePoint); + return codePoint; } - private char peek() { + private int peek() { if (isAtEnd()) return '\0'; - return source.charAt(current); + return source.codePointAt(current); } - private char peekNext() { - if (current + 1 >= source.length()) return '\0'; - return source.charAt(current + 1); + private int peekNext() { + if (isAtEnd()) return '\0'; + final int first = source.codePointAt(current); + final int nextIndex = current + Character.charCount(first); + if (nextIndex >= source.length()) return '\0'; + return source.codePointAt(nextIndex); } private boolean isAtEnd() { return current >= source.length(); } - private boolean isDigit(final char c) { + private boolean isDigit(final int c) { return c >= '0' && c <= '9'; } - private boolean isIdentifierStart(final char c) { + private boolean isIdentifierStart(final int c) { return c == '_' || Character.isAlphabetic(c); } - private boolean isIdentifierPart(final char c) { + private boolean isIdentifierPart(final int c) { return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c); } - private boolean isValidStringEscape(final char escaped) { + private boolean isValidStringEscape(final int escaped) { return escaped == '\\' || escaped == '"' || escaped == 'n' @@ -262,17 +275,30 @@ public final class PbsLexer { } private void report(final LexErrors lexErrors, final String message) { - report(lexErrors, message, start, current); + report(lexErrors, message, startByte, currentByte); } private void report( final LexErrors lexErrors, final String message, - final long spanStart, - final long spanEnd) { + final int spanStart, + final int spanEnd) { p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd)); } + private int utf8Bytes(final int codePoint) { + if (codePoint <= 0x7F) { + return 1; + } + if (codePoint <= 0x7FF) { + return 2; + } + if (codePoint <= 0xFFFF) { + return 3; + } + return 4; + } + private static Map buildKeywords() { final var map = new HashMap(); map.put("import", PbsTokenKind.IMPORT); diff --git a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/lexer/PbsLexerTest.java b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/lexer/PbsLexerTest.java index af0fd163..90168dbb 100644 --- a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/lexer/PbsLexerTest.java +++ b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/lexer/PbsLexerTest.java @@ -4,6 +4,7 @@ import org.junit.jupiter.api.Test; import p.studio.compiler.source.diagnostics.DiagnosticSink; import p.studio.compiler.source.identifiers.FileId; +import java.nio.charset.StandardCharsets; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -134,4 +135,22 @@ class PbsLexerTest { assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(), diagnostics.stream().findFirst().orElseThrow().getCode()); } + + @Test + void shouldEmitUtf8ByteOffsetsForNonAsciiIdentifier() { + final var source = "fn cafe\u00E9() -> int { return 1; }"; + final var diagnostics = DiagnosticSink.empty(); + + final var tokens = PbsLexer.lex(source, new FileId(0), diagnostics); + + assertTrue(diagnostics.isEmpty(), "Valid non-ASCII identifier should lex without diagnostics"); + + final var identifier = tokens.get(1); + final int expectedStart = "fn ".getBytes(StandardCharsets.UTF_8).length; + final int expectedEnd = "fn cafe\u00E9".getBytes(StandardCharsets.UTF_8).length; + + assertEquals(PbsTokenKind.IDENTIFIER, identifier.kind()); + assertEquals(expectedStart, identifier.start()); + assertEquals(expectedEnd, identifier.end()); + } } diff --git a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/parser/PbsParserTest.java b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/parser/PbsParserTest.java index 2d7b611e..7e4d989c 100644 --- a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/parser/PbsParserTest.java +++ b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/parser/PbsParserTest.java @@ -6,6 +6,8 @@ import p.studio.compiler.pbs.lexer.PbsLexer; import p.studio.compiler.source.diagnostics.DiagnosticSink; import p.studio.compiler.source.identifiers.FileId; +import java.nio.charset.StandardCharsets; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNull; @@ -262,4 +264,20 @@ class PbsParserTest { assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0)); assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1)); } + + @Test + void shouldPropagateUtf8ByteOffsetsToAstSpans() { + final var source = "fn cafe\u00E9() -> int { return 1; }"; + final var diagnostics = DiagnosticSink.empty(); + final var fileId = new FileId(0); + + final PbsAst.File ast = PbsParser.parse(PbsLexer.lex(source, fileId, diagnostics), fileId, diagnostics); + + assertTrue(diagnostics.isEmpty(), "Valid non-ASCII source should parse without diagnostics"); + + final int expectedEnd = source.getBytes(StandardCharsets.UTF_8).length; + final var fn = ast.functions().getFirst(); + assertEquals(expectedEnd, ast.span().getEnd()); + assertEquals(expectedEnd, fn.span().getEnd()); + } }