implements PR012

This commit is contained in:
bQUARKz 2026-03-05 17:09:40 +00:00
parent c83a31c3b6
commit 7dfaf6b49b
Signed by: bquarkz
SSH Key Fingerprint: SHA256:Z7dgqoglWwoK6j6u4QC87OveEq74WOhFN+gitsxtkf8
4 changed files with 85 additions and 67 deletions

View File

@ -1,45 +0,0 @@
# PR-012 - PBS Lexer Byte-Offset Spans
## Briefing
Lexer spans are currently tracked as Java `String` character indices. The PBS syntax spec requires stable byte offsets. This PR aligns token/span attribution with byte offsets and keeps diagnostics deterministic.
## Motivation
Without byte offsets, diagnostics and downstream attribution diverge on non-ASCII sources, violating the lexical contract.
## Target
- `prometeu-frontend-pbs` lexer and span attribution behavior.
- Diagnostics and AST attribution consumers that depend on lexer spans.
## Scope
- Convert lexer position accounting to UTF-8 byte offsets.
- Preserve existing tokenization semantics.
- Keep parser/semantics APIs unchanged.
## Method
- Introduce byte-accurate cursor accounting in lexer scanning.
- Emit token start/end using byte offsets.
- Validate compatibility with parser and diagnostics sinks.
- Add regression fixtures with non-ASCII source content.
## Acceptance Criteria
- All emitted tokens include UTF-8 byte offsets.
- Diagnostics from lexer/parser over non-ASCII sources point to correct byte spans.
- Existing ASCII tests remain green.
- New non-ASCII span tests are added and deterministic.
## Tests
- Extend lexer tests with UTF-8 multibyte identifiers/strings.
- Add parser span-attribution tests over multibyte source.
- Run full `prometeu-frontend-pbs` test suite.
## Non-Goals
- Changing token classes or grammar.
- Changing message wording policy.

View File

@ -27,6 +27,8 @@ public final class PbsLexer {
private int start;
private int current;
private int startByte;
private int currentByte;
private LexerState state = LexerState.DEFAULT;
private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) {
@ -53,7 +55,7 @@ public final class PbsLexer {
case LINE_COMMENT -> scanLineCommentState();
}
}
tokens.add(new PbsToken(PbsTokenKind.EOF, "", current, current));
tokens.add(new PbsToken(PbsTokenKind.EOF, "", currentByte, currentByte));
return ReadOnlyList.wrap(tokens);
}
@ -62,7 +64,8 @@ public final class PbsLexer {
return;
}
start = current;
final char c = advance();
startByte = currentByte;
final int c = advance();
switch (c) {
case ' ', '\r', '\t', '\n' -> {
// Deliberately ignored.
@ -130,7 +133,7 @@ public final class PbsLexer {
state = LexerState.IDENTIFIER;
return;
}
report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(c));
report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(Character.toString(c)));
}
}
}
@ -174,16 +177,16 @@ public final class PbsLexer {
private void scanStringState() {
while (!isAtEnd() && peek() != '"') {
if (peek() == '\\') {
final var escapeStart = current;
final var escapeStart = currentByte;
advance();
if (!isAtEnd()) {
final var escaped = advance();
if (!isValidStringEscape(escaped)) {
report(
LexErrors.E_LEX_INVALID_STRING_ESCAPE,
"Invalid string escape '\\%s'".formatted(escaped),
"Invalid string escape '\\%s'".formatted(Character.toString(escaped)),
escapeStart,
current);
currentByte);
}
}
continue;
@ -213,47 +216,57 @@ public final class PbsLexer {
private void addToken(final PbsTokenKind kind) {
final String lexeme = source.substring(start, current);
tokens.add(new PbsToken(kind, lexeme, start, current));
tokens.add(new PbsToken(kind, lexeme, startByte, currentByte));
}
private boolean match(final char expected) {
if (isAtEnd()) return false;
if (source.charAt(current) != expected) return false;
final var codePoint = source.codePointAt(current);
if (Character.charCount(codePoint) != 1 || codePoint != expected) {
return false;
}
current++;
currentByte++;
return true;
}
private char advance() {
return source.charAt(current++);
private int advance() {
final int codePoint = source.codePointAt(current);
current += Character.charCount(codePoint);
currentByte += utf8Bytes(codePoint);
return codePoint;
}
private char peek() {
private int peek() {
if (isAtEnd()) return '\0';
return source.charAt(current);
return source.codePointAt(current);
}
private char peekNext() {
if (current + 1 >= source.length()) return '\0';
return source.charAt(current + 1);
private int peekNext() {
if (isAtEnd()) return '\0';
final int first = source.codePointAt(current);
final int nextIndex = current + Character.charCount(first);
if (nextIndex >= source.length()) return '\0';
return source.codePointAt(nextIndex);
}
// True once the char-index cursor has consumed the entire source.
// NOTE(review): end-of-input is tracked via the char index 'current', not
// 'currentByte' — the byte cursor appears to be used only for span emission.
private boolean isAtEnd() {
return current >= source.length();
}
private boolean isDigit(final char c) {
private boolean isDigit(final int c) {
return c >= '0' && c <= '9';
}
private boolean isIdentifierStart(final char c) {
private boolean isIdentifierStart(final int c) {
return c == '_' || Character.isAlphabetic(c);
}
private boolean isIdentifierPart(final char c) {
private boolean isIdentifierPart(final int c) {
return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c);
}
private boolean isValidStringEscape(final char escaped) {
private boolean isValidStringEscape(final int escaped) {
return escaped == '\\'
|| escaped == '"'
|| escaped == 'n'
@ -262,17 +275,30 @@ public final class PbsLexer {
}
private void report(final LexErrors lexErrors, final String message) {
report(lexErrors, message, start, current);
report(lexErrors, message, startByte, currentByte);
}
private void report(
final LexErrors lexErrors,
final String message,
final long spanStart,
final long spanEnd) {
final int spanStart,
final int spanEnd) {
p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd));
}
// Returns the number of bytes the given Unicode code point occupies when
// encoded as UTF-8, per the standard length table:
//   U+0000..U+007F   -> 1 byte
//   U+0080..U+07FF   -> 2 bytes
//   U+0800..U+FFFF   -> 3 bytes
//   U+10000..U+10FFFF -> 4 bytes
// NOTE(review): a lone surrogate (U+D800..U+DFFF) would be counted as
// 3 bytes here, while String.getBytes(UTF_8) replaces it with a single
// '?' byte — byte offsets could diverge on malformed input; confirm
// whether unpaired surrogates can reach the lexer.
private int utf8Bytes(final int codePoint) {
if (codePoint <= 0x7F) {
return 1;
}
if (codePoint <= 0x7FF) {
return 2;
}
if (codePoint <= 0xFFFF) {
return 3;
}
return 4;
}
private static Map<String, PbsTokenKind> buildKeywords() {
final var map = new HashMap<String, PbsTokenKind>();
map.put("import", PbsTokenKind.IMPORT);

View File

@ -4,6 +4,7 @@ import org.junit.jupiter.api.Test;
import p.studio.compiler.source.diagnostics.DiagnosticSink;
import p.studio.compiler.source.identifiers.FileId;
import java.nio.charset.StandardCharsets;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
@ -134,4 +135,22 @@ class PbsLexerTest {
assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(),
diagnostics.stream().findFirst().orElseThrow().getCode());
}
// Regression test for PR-012: token spans must be UTF-8 byte offsets,
// not Java char indices, over non-ASCII source.
@Test
void shouldEmitUtf8ByteOffsetsForNonAsciiIdentifier() {
// U+00E9 (e-acute) is one Java char but two UTF-8 bytes, so byte
// offsets and char indices diverge after the identifier.
final var source = "fn cafe\u00E9() -> int { return 1; }";
final var diagnostics = DiagnosticSink.empty();
final var tokens = PbsLexer.lex(source, new FileId(0), diagnostics);
assertTrue(diagnostics.isEmpty(), "Valid non-ASCII identifier should lex without diagnostics");
// tokens.get(1): the identifier token following the 'fn' keyword.
final var identifier = tokens.get(1);
// Expected offsets are computed independently via getBytes(UTF_8) so the
// assertion does not depend on the lexer's own byte accounting.
final int expectedStart = "fn ".getBytes(StandardCharsets.UTF_8).length;
final int expectedEnd = "fn cafe\u00E9".getBytes(StandardCharsets.UTF_8).length;
assertEquals(PbsTokenKind.IDENTIFIER, identifier.kind());
assertEquals(expectedStart, identifier.start());
assertEquals(expectedEnd, identifier.end());
}
}

View File

@ -6,6 +6,8 @@ import p.studio.compiler.pbs.lexer.PbsLexer;
import p.studio.compiler.source.diagnostics.DiagnosticSink;
import p.studio.compiler.source.identifiers.FileId;
import java.nio.charset.StandardCharsets;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertNull;
@ -262,4 +264,20 @@ class PbsParserTest {
assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0));
assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1));
}
// Regression test for PR-012: byte-offset spans emitted by the lexer must
// survive parsing and appear unchanged in AST span attribution.
@Test
void shouldPropagateUtf8ByteOffsetsToAstSpans() {
// U+00E9 makes the UTF-8 byte length differ from the char length,
// so a char-index leak anywhere in the pipeline would shift the end span.
final var source = "fn cafe\u00E9() -> int { return 1; }";
final var diagnostics = DiagnosticSink.empty();
final var fileId = new FileId(0);
final PbsAst.File ast = PbsParser.parse(PbsLexer.lex(source, fileId, diagnostics), fileId, diagnostics);
assertTrue(diagnostics.isEmpty(), "Valid non-ASCII source should parse without diagnostics");
// Expected end computed independently of the lexer's byte accounting.
final int expectedEnd = source.getBytes(StandardCharsets.UTF_8).length;
final var fn = ast.functions().getFirst();
// Both the file-level span and the function's span must end at the
// byte length of the whole source.
assertEquals(expectedEnd, ast.span().getEnd());
assertEquals(expectedEnd, fn.span().getEnd());
}
}