Replit Agent fixes for languages like Tamil

2026-05-21 15:30:10 +00:00
parent e50b2754ae
commit 784b3729e4
1 changed files with 100 additions and 2 deletions
--- a/src/foreignthon/transpiler.py
+++ b/src/foreignthon/transpiler.py
@@ -3,6 +3,8 @@ from __future__ import annotations
 import io
 import re
 import tokenize
+import unicodedata
+from functools import lru_cache
 from pathlib import Path

 from .pack import load_pack
@@ -79,7 +81,103 @@ def _get_slice(source_lines: list[str], sr: int, sc: int, er: int, ec: int) -> s
    return "".join(parts)


+@lru_cache(maxsize=2048)
+def _is_safe_token(s: str) -> bool:
+    """Return True if the tokenizer produces `s` as a single NAME token.
+
+    Some Unicode scripts (e.g. Tamil) contain combining characters that the
+    tokenize module's regex treats as token boundaries, even though the full
+    string passes str.isidentifier().  Keys that fail this check need a
+    pre-pass string replacement before tokenization.  Input the tokenizer
+    rejects outright (TokenError, IndentationError, ...) is reported unsafe.
+    """
+    skipped = (
+        tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
+        tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
+    )
+    try:
+        stream = tokenize.generate_tokens(io.StringIO(s + "\n").readline)
+        toks = [t for t in stream if t.type not in skipped]
+    except (tokenize.TokenError, SyntaxError):  # IndentationError is a SyntaxError
+        return False
+    return len(toks) == 1 and toks[0].type == tokenize.NAME and toks[0].string == s
+
+
+def _is_id_char(c: str) -> bool:
+    """Tell whether `c` is an underscore or has a Unicode L*/M*/N* general category."""
+    major_class = unicodedata.category(c)[0]
+    return c == "_" or major_class in "LMN"
+
+
+def _code_region_replace(source: str, unsafe_mapping: dict) -> str:
+    """Replace `unsafe_mapping` keys in code regions of `source`.
+
+    STRING and COMMENT token spans reported by the tokenizer are copied
+    verbatim.  Elsewhere, keys are tried longest first and a match is replaced
+    by its value only when the characters on either side are not identifier
+    characters, so keys are never rewritten inside longer words.  Empty keys
+    are ignored: they would match everywhere without consuming any input.
+    """
+    # Offsets must use the tokenizer's own notion of a line (split on "\n"
+    # only); str.splitlines() also breaks on form feeds and other separators.
+    starts = [0]
+    for line in io.StringIO(source):
+        starts.append(starts[-1] + len(line))
+
+    protected: list[tuple[int, int]] = []
+    try:
+        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
+            if tok.type in (tokenize.STRING, tokenize.COMMENT):
+                (sr, sc), (er, ec) = tok.start, tok.end
+                protected.append((starts[sr - 1] + sc, starts[er - 1] + ec))
+    except tokenize.TokenError:
+        pass  # best effort: keep the spans found before the tokenizer gave up
+
+    keys = sorted((k for k in unsafe_mapping if k), key=len, reverse=True)
+    result: list[str] = []
+    pos = 0
+    span_idx = 0
+    n = len(source)
+
+    while pos < n:
+        if span_idx < len(protected) and pos >= protected[span_idx][0]:
+            # Copy the whole literal/comment through untouched.
+            end = protected[span_idx][1]
+            result.append(source[pos:end])
+            pos = end
+            span_idx += 1
+            continue
+
+        # A match may not run into the next protected span.
+        code_end = protected[span_idx][0] if span_idx < len(protected) else n
+
+        for k in keys:
+            stop = pos + len(k)
+            if stop > code_end or not source.startswith(k, pos):
+                continue
+            before_ok = pos == 0 or not _is_id_char(source[pos - 1])
+            after_ok = stop >= n or not _is_id_char(source[stop])
+            if before_ok and after_ok:
+                result.append(unsafe_mapping[k])
+                pos = stop
+                break
+        else:
+            # No key starts here; emit the character as-is.
+            result.append(source[pos])
+            pos += 1
+
+    return "".join(result)
+
+
 def _swap_tokens(source: str, mapping: dict) -> str:
+    safe_mapping: dict[str, str] = {}
+    unsafe_mapping: dict[str, str] = {}
+    for k, v in mapping.items():
+        (safe_mapping if _is_safe_token(k) else unsafe_mapping)[k] = v
+
+    if unsafe_mapping:
+        source = _code_region_replace(source, unsafe_mapping)
+
    source_lines = source.splitlines(keepends=True)
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))

@@ -94,8 +192,8 @@ def _swap_tokens(source: str, mapping: dict) -> str:
        gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col)
        result.append(gap)

-        if tok_type == tokenize.NAME and tok_string in mapping:
-            result.append(mapping[tok_string])
+        if tok_type == tokenize.NAME and tok_string in safe_mapping:
+            result.append(safe_mapping[tok_string])
        else:
            result.append(tok_string)