implements PR012
This commit is contained in:
parent
c83a31c3b6
commit
7dfaf6b49b
# PR-012 - PBS Lexer Byte-Offset Spans

## Briefing

Lexer spans are currently tracked as Java `String` character indices. The PBS syntax spec requires stable byte offsets. This PR aligns token/span attribution with byte offsets and keeps diagnostics deterministic.
## Motivation

Without byte offsets, diagnostics and downstream attribution diverge on non-ASCII sources, violating the lexical contract.
## Target

- `prometeu-frontend-pbs` lexer and span attribution behavior.
- Diagnostics and AST attribution consumers that depend on lexer spans.
## Scope

- Convert lexer position accounting to UTF-8 byte offsets.
- Preserve existing tokenization semantics.
- Keep parser/semantics APIs unchanged.
## Method

- Introduce byte-accurate cursor accounting in lexer scanning.
- Emit token start/end using byte offsets.
- Validate compatibility with parser and diagnostics sinks.
- Add regression fixtures with non-ASCII source content.
## Acceptance Criteria

- All emitted tokens include UTF-8 byte offsets.
- Diagnostics from lexer/parser over non-ASCII sources point to correct byte spans.
- Existing ASCII tests remain green.
- New non-ASCII span tests are added and deterministic.
## Tests

- Extend lexer tests with UTF-8 multibyte identifiers/strings.
- Add parser span-attribution tests over multibyte source.
- Run full `prometeu-frontend-pbs` test suite.
## Non-Goals

- Changing token classes or grammar.
- Changing message wording policy.
@ -27,6 +27,8 @@ public final class PbsLexer {
|
|||||||
|
|
||||||
private int start;
|
private int start;
|
||||||
private int current;
|
private int current;
|
||||||
|
private int startByte;
|
||||||
|
private int currentByte;
|
||||||
private LexerState state = LexerState.DEFAULT;
|
private LexerState state = LexerState.DEFAULT;
|
||||||
|
|
||||||
private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) {
|
private PbsLexer(final String source, final FileId fileId, final DiagnosticSink diagnostics) {
|
||||||
@ -53,7 +55,7 @@ public final class PbsLexer {
|
|||||||
case LINE_COMMENT -> scanLineCommentState();
|
case LINE_COMMENT -> scanLineCommentState();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tokens.add(new PbsToken(PbsTokenKind.EOF, "", current, current));
|
tokens.add(new PbsToken(PbsTokenKind.EOF, "", currentByte, currentByte));
|
||||||
return ReadOnlyList.wrap(tokens);
|
return ReadOnlyList.wrap(tokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -62,7 +64,8 @@ public final class PbsLexer {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
start = current;
|
start = current;
|
||||||
final char c = advance();
|
startByte = currentByte;
|
||||||
|
final int c = advance();
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case ' ', '\r', '\t', '\n' -> {
|
case ' ', '\r', '\t', '\n' -> {
|
||||||
// Deliberately ignored.
|
// Deliberately ignored.
|
||||||
@ -130,7 +133,7 @@ public final class PbsLexer {
|
|||||||
state = LexerState.IDENTIFIER;
|
state = LexerState.IDENTIFIER;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(c));
|
report(LexErrors.E_LEX_INVALID_CHAR, "Invalid character: '%s'".formatted(Character.toString(c)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -174,16 +177,16 @@ public final class PbsLexer {
|
|||||||
private void scanStringState() {
|
private void scanStringState() {
|
||||||
while (!isAtEnd() && peek() != '"') {
|
while (!isAtEnd() && peek() != '"') {
|
||||||
if (peek() == '\\') {
|
if (peek() == '\\') {
|
||||||
final var escapeStart = current;
|
final var escapeStart = currentByte;
|
||||||
advance();
|
advance();
|
||||||
if (!isAtEnd()) {
|
if (!isAtEnd()) {
|
||||||
final var escaped = advance();
|
final var escaped = advance();
|
||||||
if (!isValidStringEscape(escaped)) {
|
if (!isValidStringEscape(escaped)) {
|
||||||
report(
|
report(
|
||||||
LexErrors.E_LEX_INVALID_STRING_ESCAPE,
|
LexErrors.E_LEX_INVALID_STRING_ESCAPE,
|
||||||
"Invalid string escape '\\%s'".formatted(escaped),
|
"Invalid string escape '\\%s'".formatted(Character.toString(escaped)),
|
||||||
escapeStart,
|
escapeStart,
|
||||||
current);
|
currentByte);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
@ -213,47 +216,57 @@ public final class PbsLexer {
|
|||||||
|
|
||||||
private void addToken(final PbsTokenKind kind) {
|
private void addToken(final PbsTokenKind kind) {
|
||||||
final String lexeme = source.substring(start, current);
|
final String lexeme = source.substring(start, current);
|
||||||
tokens.add(new PbsToken(kind, lexeme, start, current));
|
tokens.add(new PbsToken(kind, lexeme, startByte, currentByte));
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean match(final char expected) {
|
private boolean match(final char expected) {
|
||||||
if (isAtEnd()) return false;
|
if (isAtEnd()) return false;
|
||||||
if (source.charAt(current) != expected) return false;
|
final var codePoint = source.codePointAt(current);
|
||||||
|
if (Character.charCount(codePoint) != 1 || codePoint != expected) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
current++;
|
current++;
|
||||||
|
currentByte++;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private char advance() {
|
private int advance() {
|
||||||
return source.charAt(current++);
|
final int codePoint = source.codePointAt(current);
|
||||||
|
current += Character.charCount(codePoint);
|
||||||
|
currentByte += utf8Bytes(codePoint);
|
||||||
|
return codePoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
private char peek() {
|
private int peek() {
|
||||||
if (isAtEnd()) return '\0';
|
if (isAtEnd()) return '\0';
|
||||||
return source.charAt(current);
|
return source.codePointAt(current);
|
||||||
}
|
}
|
||||||
|
|
||||||
private char peekNext() {
|
private int peekNext() {
|
||||||
if (current + 1 >= source.length()) return '\0';
|
if (isAtEnd()) return '\0';
|
||||||
return source.charAt(current + 1);
|
final int first = source.codePointAt(current);
|
||||||
|
final int nextIndex = current + Character.charCount(first);
|
||||||
|
if (nextIndex >= source.length()) return '\0';
|
||||||
|
return source.codePointAt(nextIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isAtEnd() {
|
private boolean isAtEnd() {
|
||||||
return current >= source.length();
|
return current >= source.length();
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isDigit(final char c) {
|
private boolean isDigit(final int c) {
|
||||||
return c >= '0' && c <= '9';
|
return c >= '0' && c <= '9';
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isIdentifierStart(final char c) {
|
private boolean isIdentifierStart(final int c) {
|
||||||
return c == '_' || Character.isAlphabetic(c);
|
return c == '_' || Character.isAlphabetic(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isIdentifierPart(final char c) {
|
private boolean isIdentifierPart(final int c) {
|
||||||
return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c);
|
return c == '_' || Character.isAlphabetic(c) || Character.isDigit(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isValidStringEscape(final char escaped) {
|
private boolean isValidStringEscape(final int escaped) {
|
||||||
return escaped == '\\'
|
return escaped == '\\'
|
||||||
|| escaped == '"'
|
|| escaped == '"'
|
||||||
|| escaped == 'n'
|
|| escaped == 'n'
|
||||||
@ -262,17 +275,30 @@ public final class PbsLexer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void report(final LexErrors lexErrors, final String message) {
|
private void report(final LexErrors lexErrors, final String message) {
|
||||||
report(lexErrors, message, start, current);
|
report(lexErrors, message, startByte, currentByte);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void report(
|
private void report(
|
||||||
final LexErrors lexErrors,
|
final LexErrors lexErrors,
|
||||||
final String message,
|
final String message,
|
||||||
final long spanStart,
|
final int spanStart,
|
||||||
final long spanEnd) {
|
final int spanEnd) {
|
||||||
p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd));
|
p.studio.compiler.source.diagnostics.Diagnostics.error(diagnostics, lexErrors.name(), message, new Span(fileId, spanStart, spanEnd));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int utf8Bytes(final int codePoint) {
|
||||||
|
if (codePoint <= 0x7F) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (codePoint <= 0x7FF) {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
if (codePoint <= 0xFFFF) {
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
|
||||||
private static Map<String, PbsTokenKind> buildKeywords() {
|
private static Map<String, PbsTokenKind> buildKeywords() {
|
||||||
final var map = new HashMap<String, PbsTokenKind>();
|
final var map = new HashMap<String, PbsTokenKind>();
|
||||||
map.put("import", PbsTokenKind.IMPORT);
|
map.put("import", PbsTokenKind.IMPORT);
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
import p.studio.compiler.source.diagnostics.DiagnosticSink;
|
import p.studio.compiler.source.diagnostics.DiagnosticSink;
|
||||||
import p.studio.compiler.source.identifiers.FileId;
|
import p.studio.compiler.source.identifiers.FileId;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
@ -134,4 +135,22 @@ class PbsLexerTest {
|
|||||||
assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(),
|
assertEquals(LexErrors.E_LEX_INVALID_CHAR.name(),
|
||||||
diagnostics.stream().findFirst().orElseThrow().getCode());
|
diagnostics.stream().findFirst().orElseThrow().getCode());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldEmitUtf8ByteOffsetsForNonAsciiIdentifier() {
|
||||||
|
final var source = "fn cafe\u00E9() -> int { return 1; }";
|
||||||
|
final var diagnostics = DiagnosticSink.empty();
|
||||||
|
|
||||||
|
final var tokens = PbsLexer.lex(source, new FileId(0), diagnostics);
|
||||||
|
|
||||||
|
assertTrue(diagnostics.isEmpty(), "Valid non-ASCII identifier should lex without diagnostics");
|
||||||
|
|
||||||
|
final var identifier = tokens.get(1);
|
||||||
|
final int expectedStart = "fn ".getBytes(StandardCharsets.UTF_8).length;
|
||||||
|
final int expectedEnd = "fn cafe\u00E9".getBytes(StandardCharsets.UTF_8).length;
|
||||||
|
|
||||||
|
assertEquals(PbsTokenKind.IDENTIFIER, identifier.kind());
|
||||||
|
assertEquals(expectedStart, identifier.start());
|
||||||
|
assertEquals(expectedEnd, identifier.end());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -6,6 +6,8 @@ import p.studio.compiler.pbs.lexer.PbsLexer;
|
|||||||
import p.studio.compiler.source.diagnostics.DiagnosticSink;
|
import p.studio.compiler.source.diagnostics.DiagnosticSink;
|
||||||
import p.studio.compiler.source.identifiers.FileId;
|
import p.studio.compiler.source.identifiers.FileId;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
|
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||||
@ -262,4 +264,20 @@ class PbsParserTest {
|
|||||||
assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0));
|
assertInstanceOf(PbsAst.InvalidDecl.class, ast.topDecls().get(0));
|
||||||
assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1));
|
assertInstanceOf(PbsAst.FunctionDecl.class, ast.topDecls().get(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldPropagateUtf8ByteOffsetsToAstSpans() {
|
||||||
|
final var source = "fn cafe\u00E9() -> int { return 1; }";
|
||||||
|
final var diagnostics = DiagnosticSink.empty();
|
||||||
|
final var fileId = new FileId(0);
|
||||||
|
|
||||||
|
final PbsAst.File ast = PbsParser.parse(PbsLexer.lex(source, fileId, diagnostics), fileId, diagnostics);
|
||||||
|
|
||||||
|
assertTrue(diagnostics.isEmpty(), "Valid non-ASCII source should parse without diagnostics");
|
||||||
|
|
||||||
|
final int expectedEnd = source.getBytes(StandardCharsets.UTF_8).length;
|
||||||
|
final var fn = ast.functions().getFirst();
|
||||||
|
assertEquals(expectedEnd, ast.span().getEnd());
|
||||||
|
assertEquals(expectedEnd, fn.span().getEnd());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user