implements PR012

2026-03-05 17:09:40 +00:00 · 2026-03-05 17:09:40 +00:00 · 7dfaf6b49b
commit 7dfaf6b49b
parent c83a31c3b6
4 changed files with 85 additions and 67 deletions
--- a/docs/pbs/pull-requests/PR-012-pbs-byte-offset-spans.md
+++ b/docs/pbs/pull-requests/PR-012-pbs-byte-offset-spans.md
@@ -1,45 +0,0 @@
-# PR-012 - PBS Lexer Byte-Offset Spans
-
-## Briefing
-
-Lexer spans are currently tracked as Java `String` character indices. The PBS syntax spec requires stable byte offsets. This PR aligns token/span attribution with byte offsets and keeps diagnostics deterministic.
-
-## Motivation
-
-Without byte offsets, diagnostics and downstream attribution diverge on non-ASCII sources, violating the lexical contract.
-
-## Target
-
-- `prometeu-frontend-pbs` lexer and span attribution behavior.
-- Diagnostics and AST attribution consumers that depend on lexer spans.
-
-## Scope
-
-- Convert lexer position accounting to UTF-8 byte offsets.
-- Preserve existing tokenization semantics.
-- Keep parser/semantics APIs unchanged.
-
-## Method
-
-- Introduce byte-accurate cursor accounting in lexer scanning.
-- Emit token start/end using byte offsets.
-- Validate compatibility with parser and diagnostics sinks.
-- Add regression fixtures with non-ASCII source content.
-
-## Acceptance Criteria
-
-- All emitted tokens include UTF-8 byte offsets.
-- Diagnostics from lexer/parser over non-ASCII sources point to correct byte spans.
-- Existing ASCII tests remain green.
-- New non-ASCII span tests are added and deterministic.
-
-## Tests
-
-- Extend lexer tests with UTF-8 multibyte identifiers/strings.
-- Add parser span-attribution tests over multibyte source.
-- Run full `prometeu-frontend-pbs` test suite.
-
-## Non-Goals
-
-- Changing token classes or grammar.
-- Changing message wording policy.
--- a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/main/java/p/studio/compiler/pbs/lexer/PbsLexer.java
+++ b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/main/java/p/studio/compiler/pbs/lexer/PbsLexer.java
@@ -27,6 +27,8 @@ public final class PbsLexer {

    private int start;
    private int current;
+    private int startByte;
+    private int currentByte;
    private LexerState state = LexerState.DEFAULT;

    private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) {
@@ -53,7 +55,7 @@ public final class PbsLexer {
                case LINE_COMMENT -> scanLineCommentState();
            }
        }
-        tokens.add(new PbsToken(PbsTokenKind.EOF, "", current, current));
+        tokens.add(new PbsToken(PbsTokenKind.EOF, "", currentByte, currentByte));
        return ReadOnlyList.wrap(tokens);
    }

@@ -62,7 +64,8 @@ public final class PbsLexer {
            return;
        }
        start = current;
-        final char c = advance();
+        startByte = currentByte;
+        final int c = advance();
        switch (c) {
            case ' ', '\r', '\t', '\n' -> {
                // Deliberately ignored.
@@ -130,7 +133,7 @@ public final class PbsLexer {
                    state = LexerState.IDENTIFIER;
                    return;
                }
-                report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(c));
+                report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(Character.toString(c)));
            }
        }
    }
@@ -174,16 +177,16 @@ public final class PbsLexer {
    private void scanStringState() {
        while (!isAtEnd() && peek() != '"') {
            if (peek() == '\\') {
-                final var escapeStart = current;
+                final var escapeStart = currentByte;
                advance();
                if (!isAtEnd()) {
                    final var escaped = advance();
                    if (!isValidStringEscape(escaped)) {
                        report(
                                LexErrors.E_LEX_INVALID_STRING_ESCAPE,
-                                "Invalid string escape '\\%s'".formatted(escaped),
+                                "Invalid string escape '\\%s'".formatted(Character.toString(escaped)),
                                escapeStart,
-                                current);
+                                currentByte);
                    }
                }
                continue;
@@ -213,47 +216,57 @@ public final class PbsLexer {

    private void addToken(final PbsTokenKind kind) {
        final String lexeme = source.substring(start, current);
-        tokens.add(new PbsToken(kind, lexeme, start, current));
+        tokens.add(new PbsToken(kind, lexeme, startByte, currentByte));
    }

    private boolean match(final char expected) {
        if (isAtEnd()) return false;
-        if (source.charAt(current) != expected) return false;
+        final var codePoint = source.codePointAt(current);
+        if (Character.charCount(codePoint) != 1 || codePoint != expected) {
+            return false;
+        }
        current++;
+        currentByte++;
        return true;
    }

-    private char advance() {
-        return source.charAt(current++);
+    private int advance() {
+        final int codePoint = source.codePointAt(current);
+        current += Character.charCount(codePoint);
+        currentByte += utf8Bytes(codePoint);
+        return codePoint;
    }

-    private char peek() {
+    private int peek() {
        if (isAtEnd()) return '\0';
-        return source.charAt(current);
+        return source.codePointAt(current);
    }

-    private char peekNext() {
-        if (current + 1 >= source.length()) return '\0';
-        return source.charAt(current + 1);
+    private int peekNext() {
+        if (isAtEnd()) return '\0';
+        final int first = source.codePointAt(current);
+        final int nextIndex = current + Character.charCount(first);
+        if (nextIndex >= source.length()) return '\0';
+        return source.codePointAt(nextIndex);
    }

    private boolean isAtEnd() {
        return current >= source.length();
    }

-    private boolean isDigit(final char c) {
+    private boolean isDigit(final int c) {
        return c >= '0' && c <= '9';
    }

-    private boolean isIdentifierStart(final char c) {
+    private boolean isIdentifierStart(final int c) {
        return c == '_' || Character.isAlphabetic(c);
    }

-    private boolean isIdentifierPart(final char c) {
+    private boolean isIdentifierPart(final int c) {
        return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c);
    }

-    private boolean isValidStringEscape(final char escaped) {
+    private boolean isValidStringEscape(final int escaped) {
        return escaped == '\\'
                || escaped == '"'
                || escaped == 'n'
@@ -262,17 +275,30 @@ public final class PbsLexer {
    }

    private void report(final LexErrors lexErrors, final String message) {
-        report(lexErrors, message, start, current);
+        report(lexErrors, message, startByte, currentByte);
    }

    private void report(
            final LexErrors lexErrors,
            final String message,
-            final long spanStart,
-            final long spanEnd) {
+            final int spanStart,
+            final int spanEnd) {
        p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd));
    }

+    private int utf8Bytes(final int codePoint) {
+        if (codePoint <= 0x7F) {
+            return 1;
+        }
+        if (codePoint <= 0x7FF) {
+            return 2;
+        }
+        if (codePoint <= 0xFFFF) {
+            return 3;
+        }
+        return 4;
+    }
+
    private static Map<String, PbsTokenKind> buildKeywords() {
        final var map = new HashMap<String, PbsTokenKind>();
        map.put("import", PbsTokenKind.IMPORT);
--- a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/lexer/PbsLexerTest.java
+++ b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/lexer/PbsLexerTest.java
@@ -4,6 +4,7 @@ import org.junit.jupiter.api.Test;
 import p.studio.compiler.source.diagnostics.DiagnosticSink;
 import p.studio.compiler.source.identifiers.FileId;

+import java.nio.charset.StandardCharsets;
 import java.util.List;

 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -134,4 +135,22 @@ class PbsLexerTest {
        assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(),
                diagnostics.stream().findFirst().orElseThrow().getCode());
    }
+
+    @Test
+    void shouldEmitUtf8ByteOffsetsForNonAsciiIdentifier() {
+        final var source = "fn cafe\u00E9() -> int { return 1; }";
+        final var diagnostics = DiagnosticSink.empty();
+
+        final var tokens = PbsLexer.lex(source, new FileId(0), diagnostics);
+
+        assertTrue(diagnostics.isEmpty(), "Valid non-ASCII identifier should lex without diagnostics");
+
+        final var identifier = tokens.get(1);
+        final int expectedStart = "fn ".getBytes(StandardCharsets.UTF_8).length;
+        final int expectedEnd = "fn cafe\u00E9".getBytes(StandardCharsets.UTF_8).length;
+
+        assertEquals(PbsTokenKind.IDENTIFIER, identifier.kind());
+        assertEquals(expectedStart, identifier.start());
+        assertEquals(expectedEnd, identifier.end());
+    }
 }
--- a/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/parser/PbsParserTest.java
+++ b/prometeu-compiler/frontends/prometeu-frontend-pbs/src/test/java/p/studio/compiler/pbs/parser/PbsParserTest.java
@@ -6,6 +6,8 @@ import p.studio.compiler.pbs.lexer.PbsLexer;
 import p.studio.compiler.source.diagnostics.DiagnosticSink;
 import p.studio.compiler.source.identifiers.FileId;

+import java.nio.charset.StandardCharsets;
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertInstanceOf;
 import static org.junit.jupiter.api.Assertions.assertNull;
@@ -262,4 +264,20 @@ class PbsParserTest {
        assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0));
        assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1));
    }
+
+    @Test
+    void shouldPropagateUtf8ByteOffsetsToAstSpans() {
+        final var source = "fn cafe\u00E9() -> int { return 1; }";
+        final var diagnostics = DiagnosticSink.empty();
+        final var fileId = new FileId(0);
+
+        final PbsAst.File ast = PbsParser.parse(PbsLexer.lex(source, fileId, diagnostics), fileId, diagnostics);
+
+        assertTrue(diagnostics.isEmpty(), "Valid non-ASCII source should parse without diagnostics");
+
+        final int expectedEnd = source.getBytes(StandardCharsets.UTF_8).length;
+        final var fn = ast.functions().getFirst();
+        assertEquals(expectedEnd, ast.span().getEnd());
+        assertEquals(expectedEnd, fn.span().getEnd());
+    }
 }