pr 03

2026-02-05 15:05:16 +00:00 · 2026-02-05 15:05:16 +00:00 · 0e76368cba
commit 0e76368cba
parent 07f986df5b
3 changed files with 172 additions and 0 deletions
--- a/crates/prometeu-analysis/src/lib.rs
+++ b/crates/prometeu-analysis/src/lib.rs
@ -2,8 +2,10 @@ pub mod ids;
 pub mod span;
 pub mod file_db;
 pub mod interner;
 pub mod text_index;
 pub use ids::*;
 pub use span::Span;
 pub use file_db::{FileDB, LineIndex};
 pub use interner::NameInterner;
 pub use text_index::TextIndex;
--- a/crates/prometeu-analysis/src/text_index.rs
+++ b/crates/prometeu-analysis/src/text_index.rs
@ -0,0 +1,113 @@
 /// Converts between byte offsets (as used by the core) and LSP positions
 /// (zero-based line, column counted in UTF-16 code units).
 ///
 /// Notes:
 /// - `line_starts` holds the byte offset at which each line begins; entry 0 is always 0.
 /// - A private copy of the text is kept so conversions need no external dependencies.
 /// - A line's content excludes its terminator: the trailing `\n`, and also the `\r`
 ///   of a `\r\n` pair. A lone `\r` is NOT treated as a line break.
 /// - Offsets are stored as `u32`; texts larger than `u32::MAX` bytes are not supported.
 #[derive(Clone, Debug)]
 pub struct TextIndex {
    text: String,
    line_starts: Vec<u32>,
 }
 impl TextIndex {
    /// Builds the index from the current textual content of a file.
    pub fn new(text: &str) -> Self {
        let mut line_starts: Vec<u32> = Vec::with_capacity(128);
        line_starts.push(0);
        // The next line begins at the byte right after each '\n'.
        // NOTE: the `as u32` cast truncates for texts over `u32::MAX` bytes (unsupported).
        line_starts.extend(
            text.char_indices()
                .filter(|&(_, ch)| ch == '\n')
                .map(|(pos, _)| (pos + 1) as u32),
        );
        Self {
            text: text.to_string(),
            line_starts,
        }
    }
    /// Number of lines in the text.
    ///
    /// Always at least 1: an empty text has one (empty) line, and a trailing `\n`
    /// is followed by a final empty line that is counted too.
    #[inline]
    pub fn line_count(&self) -> u32 {
        self.line_starts.len() as u32
    }
    /// Converts a byte offset (within the file) into an LSP `(line, UTF-16 column)` pair.
    ///
    /// - Offsets past the end of the text are clamped to the end of the text.
    /// - An offset at (or inside) the line terminator yields the UTF-16 length of the line.
    /// - An offset that falls inside a multi-byte character yields the column just
    ///   after that character.
    pub fn byte_to_lsp(&self, byte: u32) -> (u32, u32) {
        let byte = byte.min(self.text.len() as u32);
        // `line_starts` is sorted and begins with 0, so on a miss the insertion point
        // is >= 1 and the containing line is the one just before it.
        let line_idx = match self.line_starts.binary_search(&byte) {
            Ok(exact) => exact,
            Err(insert_at) => insert_at.saturating_sub(1),
        };
        let line = line_idx as u32;
        let (line_start, line_end) = self.line_bounds(line);
        let rel = (byte as usize).saturating_sub(line_start);
        // Sum the UTF-16 width of every character that starts before the target offset.
        let utf16_col: u32 = self.text[line_start..line_end]
            .char_indices()
            .take_while(|&(offset, _)| offset < rel)
            .map(|(_, ch)| ch.len_utf16() as u32)
            .sum();
        (line, utf16_col)
    }
    /// Converts an LSP `(line, UTF-16 column)` pair into a byte offset.
    ///
    /// - Out-of-range lines are clamped to the last line.
    /// - Columns beyond the UTF-16 length of the line return the end of the line's
    ///   content (before `\n`, and before `\r\n` for CRLF text).
    /// - A column that falls between the two code units of a surrogate pair returns
    ///   the offset just after that character.
    pub fn lsp_to_byte(&self, line: u32, utf16_col: u32) -> u32 {
        let line = line.min(self.line_count().saturating_sub(1));
        let (line_start, line_end) = self.line_bounds(line);
        // UTF-16 units consumed by the characters preceding the current one.
        let mut units_before: u32 = 0;
        for (offset, ch) in self.text[line_start..line_end].char_indices() {
            if units_before >= utf16_col {
                return (line_start + offset) as u32;
            }
            units_before += ch.len_utf16() as u32;
        }
        // Target column is at or past the last character: return the end of the line.
        line_end as u32
    }
    /// Returns the byte range `(start, end)` of the content of `line`, terminator excluded.
    ///
    /// Out-of-range lines are treated as the last line.
    #[inline]
    fn line_bounds(&self, line: u32) -> (usize, usize) {
        // Invariant: `line_starts` always contains at least the entry 0.
        let last = self.line_starts.len() - 1;
        let idx = (line as usize).min(last);
        let start = self.line_starts[idx] as usize;
        let end = match self.line_starts.get(idx + 1) {
            Some(&next_start) => {
                // `next_start` is the byte after this line's '\n', so the '\n' itself
                // sits at `next_start - 1`; content ends there.
                let mut end = (next_start as usize).saturating_sub(1);
                // For CRLF text, also drop the '\r' so it is not counted as content.
                // The `end > start` guard keeps us inside the current line.
                if end > start && self.text.as_bytes()[end - 1] == b'\r' {
                    end -= 1;
                }
                end
            }
            // Last line: content runs to the end of the text.
            None => self.text.len(),
        };
        (start, end)
    }
 }
 #[cfg(test)]
 mod tests_internal {
    use super::*;
    /// `line_bounds` must report each line's content range without its trailing '\n'.
    #[test]
    fn line_bounds_basic() {
        let idx = TextIndex::new("ab\ncd\n");
        assert_eq!(idx.line_count(), 3);
        // (line, expected bounds): "ab", then "cd", then the empty final line.
        let expected: [(u32, (usize, usize)); 3] = [(0, (0, 2)), (1, (3, 5)), (2, (6, 6))];
        for &(line, bounds) in expected.iter() {
            assert_eq!(idx.line_bounds(line), bounds);
        }
    }
 }
--- a/crates/prometeu-analysis/tests/text_index_tests.rs
+++ b/crates/prometeu-analysis/tests/text_index_tests.rs
@ -0,0 +1,57 @@
 use prometeu_analysis::TextIndex;
 #[test]
 fn text_index_ascii_roundtrip() {
    let text = "hello\nworld\nthis is ascii";
    let idx = TextIndex::new(text);
    // Round-trip every char boundary, plus the end-of-text offset.
    let boundaries = text
        .char_indices()
        .map(|(start, _)| start)
        .chain(std::iter::once(text.len()));
    for b in boundaries {
        let (line, col16) = idx.byte_to_lsp(b as u32);
        let b2 = idx.lsp_to_byte(line, col16);
        assert_eq!(b2, b as u32, "roundtrip falhou para byte {} -> (l={},c16={})", b, line, col16);
    }
    // Direct spot checks as (byte offset, expected (line, column)):
    // - 0: start of the text
    // - 5: right after "hello", before the '\n'
    // - 6: right after the '\n', i.e. start of line 1
    let spot_checks = [(0, (0, 0)), (5, (0, 5)), (6, (1, 0))];
    for &(byte, position) in spot_checks.iter() {
        assert_eq!(idx.byte_to_lsp(byte), position);
    }
 }
 #[test]
 fn text_index_unicode_roundtrip_utf16() {
    // "a" (1 byte, 1 UTF-16 unit), "é" (2 bytes, 1 unit), "🙂" (4 bytes, 2 units), "b" (1 byte, 1 unit)
    let text = "aé🙂b";
    let idx = TextIndex::new(text);
    // Round-trip every char boundary, plus the end-of-text offset.
    let boundaries = text
        .char_indices()
        .map(|(start, _)| start)
        .chain(std::iter::once(text.len()));
    for b in boundaries {
        let (line, col16) = idx.byte_to_lsp(b as u32);
        let b2 = idx.lsp_to_byte(line, col16);
        assert_eq!(b2, b as u32, "roundtrip unicode falhou para byte {} -> (l={},c16={})", b, line, col16);
    }
    // Byte layout on line 0: [0:'a'][1..2:'é'][3..6:'🙂'][7:'b'][8:end]
    // Expected (byte offset, UTF-16 column) pairs:
    //   0 -> 0 (before 'a'), 1 -> 1 (after 'a'), 3 -> 2 (after 'é'),
    //   7 -> 4 (after '🙂', which takes 2 units), 8 -> 5 (after 'b')
    let byte_to_col = [(0, 0), (1, 1), (3, 2), (7, 4), (8, 5)];
    for &(byte, col) in byte_to_col.iter() {
        assert_eq!(idx.byte_to_lsp(byte), (0, col));
    }
    // And the inverse direction for the same columns.
    let col_to_byte = [(0, 0), (1, 1), (2, 3), (4, 7), (5, 8)];
    for &(col, byte) in col_to_byte.iter() {
        assert_eq!(idx.lsp_to_byte(0, col), byte);
    }
 }