implements PR012

This commit is contained in:
bQUARKz 2026-03-05 17:09:40 +00:00
parent c83a31c3b6
commit 7dfaf6b49b
Signed by: bquarkz
SSH Key Fingerprint: SHA256:Z7dgqoglWwoK6j6u4QC87OveEq74WOhFN+gitsxtkf8
4 changed files with 85 additions and 67 deletions

View File

@ -1,45 +0,0 @@
# PR-012 - PBS Lexer Byte-Offset Spans
## Briefing
Lexer spans are currently tracked as Java `String` character indices. The PBS syntax spec requires stable byte offsets. This PR aligns token/span attribution with byte offsets and keeps diagnostics deterministic.
## Motivation
Without byte offsets, diagnostics and downstream attribution diverge on non-ASCII sources, violating the lexical contract.
## Target
- `prometeu-frontend-pbs` lexer and span attribution behavior.
- Diagnostics and AST attribution consumers that depend on lexer spans.
## Scope
- Convert lexer position accounting to UTF-8 byte offsets.
- Preserve existing tokenization semantics.
- Keep parser/semantics APIs unchanged.
## Method
- Introduce byte-accurate cursor accounting in lexer scanning.
- Emit token start/end using byte offsets.
- Validate compatibility with parser and diagnostics sinks.
- Add regression fixtures with non-ASCII source content.
## Acceptance Criteria
- All emitted tokens include UTF-8 byte offsets.
- Diagnostics from lexer/parser over non-ASCII sources point to correct byte spans.
- Existing ASCII tests remain green.
- New non-ASCII span tests are added and deterministic.
## Tests
- Extend lexer tests with UTF-8 multibyte identifiers/strings.
- Add parser span-attribution tests over multibyte source.
- Run full `prometeu-frontend-pbs` test suite.
## Non-Goals
- Changing token classes or grammar.
- Changing message wording policy.

View File

@ -27,6 +27,8 @@ public final class PbsLexer {
private int start; private int start;
private int current; private int current;
private int startByte;
private int currentByte;
private LexerState state = LexerState.DEFAULT; private LexerState state = LexerState.DEFAULT;
private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) { private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) {
@ -53,7 +55,7 @@ public final class PbsLexer {
case LINE_COMMENT -> scanLineCommentState(); case LINE_COMMENT -> scanLineCommentState();
} }
} }
tokens.add(new PbsToken(PbsTokenKind.EOF, "", current, current)); tokens.add(new PbsToken(PbsTokenKind.EOF, "", currentByte, currentByte));
return ReadOnlyList.wrap(tokens); return ReadOnlyList.wrap(tokens);
} }
@ -62,7 +64,8 @@ public final class PbsLexer {
return; return;
} }
start = current; start = current;
final char c = advance(); startByte = currentByte;
final int c = advance();
switch (c) { switch (c) {
case ' ', '\r', '\t', '\n' -> { case ' ', '\r', '\t', '\n' -> {
// Deliberately ignored. // Deliberately ignored.
@ -130,7 +133,7 @@ public final class PbsLexer {
state = LexerState.IDENTIFIER; state = LexerState.IDENTIFIER;
return; return;
} }
report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(c)); report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(Character.toString(c)));
} }
} }
} }
@ -174,16 +177,16 @@ public final class PbsLexer {
private void scanStringState() { private void scanStringState() {
while (!isAtEnd() && peek() != '"') { while (!isAtEnd() && peek() != '"') {
if (peek() == '\\') { if (peek() == '\\') {
final var escapeStart = current; final var escapeStart = currentByte;
advance(); advance();
if (!isAtEnd()) { if (!isAtEnd()) {
final var escaped = advance(); final var escaped = advance();
if (!isValidStringEscape(escaped)) { if (!isValidStringEscape(escaped)) {
report( report(
LexErrors.E_LEX_INVALID_STRING_ESCAPE, LexErrors.E_LEX_INVALID_STRING_ESCAPE,
"Invalid string escape '\\%s'".formatted(escaped), "Invalid string escape '\\%s'".formatted(Character.toString(escaped)),
escapeStart, escapeStart,
current); currentByte);
} }
} }
continue; continue;
@ -213,47 +216,57 @@ public final class PbsLexer {
private void addToken(final PbsTokenKind kind) { private void addToken(final PbsTokenKind kind) {
final String lexeme = source.substring(start, current); final String lexeme = source.substring(start, current);
tokens.add(new PbsToken(kind, lexeme, start, current)); tokens.add(new PbsToken(kind, lexeme, startByte, currentByte));
} }
private boolean match(final char expected) { private boolean match(final char expected) {
if (isAtEnd()) return false; if (isAtEnd()) return false;
if (source.charAt(current) != expected) return false; final var codePoint = source.codePointAt(current);
if (Character.charCount(codePoint) != 1 || codePoint != expected) {
return false;
}
current++; current++;
currentByte++;
return true; return true;
} }
private char advance() { private int advance() {
return source.charAt(current++); final int codePoint = source.codePointAt(current);
current += Character.charCount(codePoint);
currentByte += utf8Bytes(codePoint);
return codePoint;
} }
private char peek() { private int peek() {
if (isAtEnd()) return '\0'; if (isAtEnd()) return '\0';
return source.charAt(current); return source.codePointAt(current);
} }
private char peekNext() { private int peekNext() {
if (current + 1 >= source.length()) return '\0'; if (isAtEnd()) return '\0';
return source.charAt(current + 1); final int first = source.codePointAt(current);
final int nextIndex = current + Character.charCount(first);
if (nextIndex >= source.length()) return '\0';
return source.codePointAt(nextIndex);
} }
private boolean isAtEnd() { private boolean isAtEnd() {
return current >= source.length(); return current >= source.length();
} }
private boolean isDigit(final char c) { private boolean isDigit(final int c) {
return c >= '0' && c <= '9'; return c >= '0' && c <= '9';
} }
private boolean isIdentifierStart(final char c) { private boolean isIdentifierStart(final int c) {
return c == '_' || Character.isAlphabetic(c); return c == '_' || Character.isAlphabetic(c);
} }
private boolean isIdentifierPart(final char c) { private boolean isIdentifierPart(final int c) {
return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c); return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c);
} }
private boolean isValidStringEscape(final char escaped) { private boolean isValidStringEscape(final int escaped) {
return escaped == '\\' return escaped == '\\'
|| escaped == '"' || escaped == '"'
|| escaped == 'n' || escaped == 'n'
@ -262,17 +275,30 @@ public final class PbsLexer {
} }
private void report(final LexErrors lexErrors, final String message) { private void report(final LexErrors lexErrors, final String message) {
report(lexErrors, message, start, current); report(lexErrors, message, startByte, currentByte);
} }
private void report( private void report(
final LexErrors lexErrors, final LexErrors lexErrors,
final String message, final String message,
final long spanStart, final int spanStart,
final long spanEnd) { final int spanEnd) {
p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd)); p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd));
} }
/**
 * Returns the number of bytes the given code point occupies in UTF-8,
 * mirroring what {@code String#getBytes(StandardCharsets.UTF_8)} produces
 * so lexer byte offsets stay aligned with byte-encoded source lengths.
 *
 * @param codePoint a code point as returned by {@code String#codePointAt}
 * @return the UTF-8 encoded length in bytes (1–4)
 */
private int utf8Bytes(final int codePoint) {
    if (codePoint <= 0x7F) {
        return 1;
    }
    if (codePoint <= 0x7FF) {
        return 2;
    }
    // codePointAt can yield an UNPAIRED surrogate (U+D800..U+DFFF) on malformed
    // input. UTF-8 cannot encode lone surrogates; Java's UTF-8 encoder replaces
    // one with a single '?' byte, so count 1 to match getBytes(UTF_8) lengths.
    if (Character.isSurrogate((char) codePoint)) {
        return 1;
    }
    if (codePoint <= 0xFFFF) {
        return 3;
    }
    return 4;
}
private static Map<String, PbsTokenKind> buildKeywords() { private static Map<String, PbsTokenKind> buildKeywords() {
final var map = new HashMap<String, PbsTokenKind>(); final var map = new HashMap<String, PbsTokenKind>();
map.put("import", PbsTokenKind.IMPORT); map.put("import", PbsTokenKind.IMPORT);

View File

@ -4,6 +4,7 @@ import org.junit.jupiter.api.Test;
import p.studio.compiler.source.diagnostics.DiagnosticSink; import p.studio.compiler.source.diagnostics.DiagnosticSink;
import p.studio.compiler.source.identifiers.FileId; import p.studio.compiler.source.identifiers.FileId;
import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
@ -134,4 +135,22 @@ class PbsLexerTest {
assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(), assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(),
diagnostics.stream().findFirst().orElseThrow().getCode()); diagnostics.stream().findFirst().orElseThrow().getCode());
} }
@Test
void shouldEmitUtf8ByteOffsetsForNonAsciiIdentifier() {
    // Lex a source whose identifier contains a multibyte character and verify
    // the identifier token's span is reported in UTF-8 byte offsets.
    final var nonAsciiSource = "fn cafe\u00E9() -> int { return 1; }";
    final var sink = DiagnosticSink.empty();
    final var lexed = PbsLexer.lex(nonAsciiSource, new FileId(0), sink);
    assertTrue(sink.isEmpty(), "Valid non-ASCII identifier should lex without diagnostics");
    // Expected offsets are derived from the UTF-8 encoding of the source prefix
    // up to (and through) the identifier, so the test stays self-describing.
    final int wantStart = "fn ".getBytes(StandardCharsets.UTF_8).length;
    final int wantEnd = "fn cafe\u00E9".getBytes(StandardCharsets.UTF_8).length;
    final var identifierToken = lexed.get(1);
    assertEquals(PbsTokenKind.IDENTIFIER, identifierToken.kind());
    assertEquals(wantStart, identifierToken.start());
    assertEquals(wantEnd, identifierToken.end());
}
} }

View File

@ -6,6 +6,8 @@ import p.studio.compiler.pbs.lexer.PbsLexer;
import p.studio.compiler.source.diagnostics.DiagnosticSink; import p.studio.compiler.source.diagnostics.DiagnosticSink;
import p.studio.compiler.source.identifiers.FileId; import p.studio.compiler.source.identifiers.FileId;
import java.nio.charset.StandardCharsets;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertNull;
@ -262,4 +264,20 @@ class PbsParserTest {
assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0)); assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0));
assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1)); assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1));
} }
@Test
void shouldPropagateUtf8ByteOffsetsToAstSpans() {
    // Parse a non-ASCII source and verify AST spans end at the UTF-8 byte
    // length of the source rather than its Java char count.
    final var multibyteSource = "fn cafe\u00E9() -> int { return 1; }";
    final var sink = DiagnosticSink.empty();
    final var file = new FileId(0);
    final PbsAst.File ast = PbsParser.parse(PbsLexer.lex(multibyteSource, file, sink), file, sink);
    assertTrue(sink.isEmpty(), "Valid non-ASCII source should parse without diagnostics");
    final int byteLength = multibyteSource.getBytes(StandardCharsets.UTF_8).length;
    final var firstFunction = ast.functions().getFirst();
    assertEquals(byteLength, ast.span().getEnd());
    assertEquals(byteLength, firstFunction.span().getEnd());
}
} }