Implements PR-012
This commit is contained in:
parent
c83a31c3b6
commit
7dfaf6b49b
@ -1,45 +0,0 @@
|
||||
# PR-012 - PBS Lexer Byte-Offset Spans
|
||||
|
||||
## Briefing
|
||||
|
||||
Lexer spans are currently tracked as Java `String` character indices (UTF-16 code units). The PBS syntax spec requires stable byte offsets. This PR aligns token/span attribution with UTF-8 byte offsets and keeps diagnostics deterministic.
|
||||
|
||||
## Motivation
|
||||
|
||||
Without byte offsets, the spans used by diagnostics and downstream attribution diverge from the actual source positions on non-ASCII sources, violating the lexical contract.
|
||||
|
||||
## Target
|
||||
|
||||
- `prometeu-frontend-pbs` lexer and span attribution behavior.
|
||||
- Diagnostics and AST attribution consumers that depend on lexer spans.
|
||||
|
||||
## Scope
|
||||
|
||||
- Convert lexer position accounting to UTF-8 byte offsets.
|
||||
- Preserve existing tokenization semantics.
|
||||
- Keep parser/semantics APIs unchanged.
|
||||
|
||||
## Method
|
||||
|
||||
- Introduce byte-accurate cursor accounting in lexer scanning.
|
||||
- Emit token start/end using byte offsets.
|
||||
- Validate compatibility with parser and diagnostics sinks.
|
||||
- Add regression fixtures with non-ASCII source content.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
- All emitted tokens include UTF-8 byte offsets.
|
||||
- Diagnostics from lexer/parser over non-ASCII sources point to the correct byte spans.
|
||||
- Existing ASCII tests remain green.
|
||||
- New non-ASCII span tests are added and deterministic.
|
||||
|
||||
## Tests
|
||||
|
||||
- Extend lexer tests with UTF-8 multibyte identifiers/strings.
|
||||
- Add parser span-attribution tests over multibyte source.
|
||||
- Run full `prometeu-frontend-pbs` test suite.
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Changing token classes or grammar.
|
||||
- Changing message wording policy.
|
||||
@ -27,6 +27,8 @@ public final class PbsLexer {
|
||||
|
||||
private int start;
|
||||
private int current;
|
||||
private int startByte;
|
||||
private int currentByte;
|
||||
private LexerState state = LexerState.DEFAULT;
|
||||
|
||||
private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) {
|
||||
@ -53,7 +55,7 @@ public final class PbsLexer {
|
||||
case LINE_COMMENT -> scanLineCommentState();
|
||||
}
|
||||
}
|
||||
tokens.add(new PbsToken(PbsTokenKind.EOF, "", current, current));
|
||||
tokens.add(new PbsToken(PbsTokenKind.EOF, "", currentByte, currentByte));
|
||||
return ReadOnlyList.wrap(tokens);
|
||||
}
|
||||
|
||||
@ -62,7 +64,8 @@ public final class PbsLexer {
|
||||
return;
|
||||
}
|
||||
start = current;
|
||||
final char c = advance();
|
||||
startByte = currentByte;
|
||||
final int c = advance();
|
||||
switch (c) {
|
||||
case ' ', '\r', '\t', '\n' -> {
|
||||
// Deliberately ignored.
|
||||
@ -130,7 +133,7 @@ public final class PbsLexer {
|
||||
state = LexerState.IDENTIFIER;
|
||||
return;
|
||||
}
|
||||
report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(c));
|
||||
report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(Character.toString(c)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -174,16 +177,16 @@ public final class PbsLexer {
|
||||
private void scanStringState() {
|
||||
while (!isAtEnd() && peek() != '"') {
|
||||
if (peek() == '\\') {
|
||||
final var escapeStart = current;
|
||||
final var escapeStart = currentByte;
|
||||
advance();
|
||||
if (!isAtEnd()) {
|
||||
final var escaped = advance();
|
||||
if (!isValidStringEscape(escaped)) {
|
||||
report(
|
||||
LexErrors.E_LEX_INVALID_STRING_ESCAPE,
|
||||
"Invalid string escape '\\%s'".formatted(escaped),
|
||||
"Invalid string escape '\\%s'".formatted(Character.toString(escaped)),
|
||||
escapeStart,
|
||||
current);
|
||||
currentByte);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
@ -213,47 +216,57 @@ public final class PbsLexer {
|
||||
|
||||
private void addToken(final PbsTokenKind kind) {
|
||||
final String lexeme = source.substring(start, current);
|
||||
tokens.add(new PbsToken(kind, lexeme, start, current));
|
||||
tokens.add(new PbsToken(kind, lexeme, startByte, currentByte));
|
||||
}
|
||||
|
||||
private boolean match(final char expected) {
|
||||
if (isAtEnd()) return false;
|
||||
if (source.charAt(current) != expected) return false;
|
||||
final var codePoint = source.codePointAt(current);
|
||||
if (Character.charCount(codePoint) != 1 || codePoint != expected) {
|
||||
return false;
|
||||
}
|
||||
current++;
|
||||
currentByte++;
|
||||
return true;
|
||||
}
|
||||
|
||||
private char advance() {
|
||||
return source.charAt(current++);
|
||||
private int advance() {
|
||||
final int codePoint = source.codePointAt(current);
|
||||
current += Character.charCount(codePoint);
|
||||
currentByte += utf8Bytes(codePoint);
|
||||
return codePoint;
|
||||
}
|
||||
|
||||
private char peek() {
|
||||
private int peek() {
|
||||
if (isAtEnd()) return '\0';
|
||||
return source.charAt(current);
|
||||
return source.codePointAt(current);
|
||||
}
|
||||
|
||||
private char peekNext() {
|
||||
if (current + 1 >= source.length()) return '\0';
|
||||
return source.charAt(current + 1);
|
||||
private int peekNext() {
|
||||
if (isAtEnd()) return '\0';
|
||||
final int first = source.codePointAt(current);
|
||||
final int nextIndex = current + Character.charCount(first);
|
||||
if (nextIndex >= source.length()) return '\0';
|
||||
return source.codePointAt(nextIndex);
|
||||
}
|
||||
|
||||
private boolean isAtEnd() {
|
||||
return current >= source.length();
|
||||
}
|
||||
|
||||
private boolean isDigit(final char c) {
|
||||
private boolean isDigit(final int c) {
|
||||
return c >= '0' && c <= '9';
|
||||
}
|
||||
|
||||
private boolean isIdentifierStart(final char c) {
|
||||
private boolean isIdentifierStart(final int c) {
|
||||
return c == '_' || Character.isAlphabetic(c);
|
||||
}
|
||||
|
||||
private boolean isIdentifierPart(final char c) {
|
||||
private boolean isIdentifierPart(final int c) {
|
||||
return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c);
|
||||
}
|
||||
|
||||
private boolean isValidStringEscape(final char escaped) {
|
||||
private boolean isValidStringEscape(final int escaped) {
|
||||
return escaped == '\\'
|
||||
|| escaped == '"'
|
||||
|| escaped == 'n'
|
||||
@ -262,17 +275,30 @@ public final class PbsLexer {
|
||||
}
|
||||
|
||||
private void report(final LexErrors lexErrors, final String message) {
|
||||
report(lexErrors, message, start, current);
|
||||
report(lexErrors, message, startByte, currentByte);
|
||||
}
|
||||
|
||||
private void report(
|
||||
final LexErrors lexErrors,
|
||||
final String message,
|
||||
final long spanStart,
|
||||
final long spanEnd) {
|
||||
final int spanStart,
|
||||
final int spanEnd) {
|
||||
p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd));
|
||||
}
|
||||
|
||||
private int utf8Bytes(final int codePoint) {
|
||||
if (codePoint <= 0x7F) {
|
||||
return 1;
|
||||
}
|
||||
if (codePoint <= 0x7FF) {
|
||||
return 2;
|
||||
}
|
||||
if (codePoint <= 0xFFFF) {
|
||||
return 3;
|
||||
}
|
||||
return 4;
|
||||
}
|
||||
|
||||
private static Map<String, PbsTokenKind> buildKeywords() {
|
||||
final var map = new HashMap<String, PbsTokenKind>();
|
||||
map.put("import", PbsTokenKind.IMPORT);
|
||||
|
||||
@ -4,6 +4,7 @@ import org.junit.jupiter.api.Test;
|
||||
import p.studio.compiler.source.diagnostics.DiagnosticSink;
|
||||
import p.studio.compiler.source.identifiers.FileId;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
@ -134,4 +135,22 @@ class PbsLexerTest {
|
||||
assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(),
|
||||
diagnostics.stream().findFirst().orElseThrow().getCode());
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldEmitUtf8ByteOffsetsForNonAsciiIdentifier() {
|
||||
final var source = "fn cafe\u00E9() -> int { return 1; }";
|
||||
final var diagnostics = DiagnosticSink.empty();
|
||||
|
||||
final var tokens = PbsLexer.lex(source, new FileId(0), diagnostics);
|
||||
|
||||
assertTrue(diagnostics.isEmpty(), "Valid non-ASCII identifier should lex without diagnostics");
|
||||
|
||||
final var identifier = tokens.get(1);
|
||||
final int expectedStart = "fn ".getBytes(StandardCharsets.UTF_8).length;
|
||||
final int expectedEnd = "fn cafe\u00E9".getBytes(StandardCharsets.UTF_8).length;
|
||||
|
||||
assertEquals(PbsTokenKind.IDENTIFIER, identifier.kind());
|
||||
assertEquals(expectedStart, identifier.start());
|
||||
assertEquals(expectedEnd, identifier.end());
|
||||
}
|
||||
}
|
||||
|
||||
@ -6,6 +6,8 @@ import p.studio.compiler.pbs.lexer.PbsLexer;
|
||||
import p.studio.compiler.source.diagnostics.DiagnosticSink;
|
||||
import p.studio.compiler.source.identifiers.FileId;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
@ -262,4 +264,20 @@ class PbsParserTest {
|
||||
assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0));
|
||||
assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldPropagateUtf8ByteOffsetsToAstSpans() {
|
||||
final var source = "fn cafe\u00E9() -> int { return 1; }";
|
||||
final var diagnostics = DiagnosticSink.empty();
|
||||
final var fileId = new FileId(0);
|
||||
|
||||
final PbsAst.File ast = PbsParser.parse(PbsLexer.lex(source, fileId, diagnostics), fileId, diagnostics);
|
||||
|
||||
assertTrue(diagnostics.isEmpty(), "Valid non-ASCII source should parse without diagnostics");
|
||||
|
||||
final int expectedEnd = source.getBytes(StandardCharsets.UTF_8).length;
|
||||
final var fn = ast.functions().getFirst();
|
||||
assertEquals(expectedEnd, ast.span().getEnd());
|
||||
assertEquals(expectedEnd, fn.span().getEnd());
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user