Replit agent fixes for languages like Tamil
This commit is contained in:
@@ -3,6 +3,8 @@ from __future__ import annotations
|
||||
import io
|
||||
import re
|
||||
import tokenize
|
||||
import unicodedata
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from .pack import load_pack
|
||||
@@ -79,7 +81,103 @@ def _get_slice(source_lines: list[str], sr: int, sc: int, er: int, ec: int) -> s
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def _is_safe_token(s: str) -> bool:
|
||||
"""Return True if the tokenizer produces `s` as a single NAME token.
|
||||
|
||||
Some Unicode scripts (e.g. Tamil) contain combining characters that the
|
||||
tokenize module's regex treats as token boundaries, even though the full
|
||||
string passes str.isidentifier(). Keys that fail this check need a
|
||||
pre-pass string replacement before tokenization.
|
||||
"""
|
||||
try:
|
||||
toks = [
|
||||
t for t in tokenize.generate_tokens(io.StringIO(s + "\n").readline)
|
||||
if t.type not in (
|
||||
tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
|
||||
tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
|
||||
)
|
||||
]
|
||||
return len(toks) == 1 and toks[0].type == tokenize.NAME and toks[0].string == s
|
||||
except tokenize.TokenError:
|
||||
return False
|
||||
|
||||
|
||||
def _is_id_char(c: str) -> bool:
|
||||
"""True if `c` can be part of an identifier (letter, mark, digit, or underscore)."""
|
||||
cat = unicodedata.category(c)
|
||||
return cat.startswith(("L", "M", "N")) or c == "_"
|
||||
|
||||
|
||||
def _code_region_replace(source: str, unsafe_mapping: dict) -> str:
    """Replace unsafe keys in source only in code regions (not string literals or comments).

    The tokenizer correctly identifies STRING/COMMENT boundaries even when NAME
    tokens are mangled by combining characters, so we use it to find protected spans.
    Identifier-boundary checks prevent partial matches inside longer words.

    Args:
        source: Full program text to rewrite.
        unsafe_mapping: Maps each key to the replacement text emitted in its
            place. Empty keys are ignored.

    Returns:
        A copy of ``source`` in which every whole-word occurrence of a key
        outside the protected spans has been replaced. Longer keys win over
        shorter ones at the same position.
    """
    # Longest keys first so a short key never pre-empts a longer one. Empty
    # keys are dropped: they match everywhere with length 0 and would stop the
    # scan from ever advancing.
    keys = sorted((k for k in unsafe_mapping if k), key=len, reverse=True)
    if not keys:
        return source

    # Build the row -> absolute-offset table from the same line splitting the
    # tokenizer sees (StringIO.readline breaks on "\n" only). str.splitlines()
    # also breaks on form feed, U+2028, etc., which would shift every later
    # row and misplace the protected spans.
    line_start = [0]
    for line in io.StringIO(source).readlines():
        line_start.append(line_start[-1] + len(line))

    # Token types whose text must never be rewritten. On Python 3.12+ the
    # literal parts of f-strings arrive as FSTRING_MIDDLE instead of STRING
    # (TSTRING_MIDDLE for template strings where available), so include them
    # when this interpreter defines them.
    protected_types = {
        getattr(tokenize, name)
        for name in ("STRING", "COMMENT", "FSTRING_MIDDLE", "TSTRING_MIDDLE")
        if hasattr(tokenize, name)
    }

    protected: list[tuple[int, int]] = []
    try:
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if tok.type in protected_types:
                sr, sc = tok.start
                er, ec = tok.end
                protected.append((line_start[sr - 1] + sc, line_start[er - 1] + ec))
    except tokenize.TokenError:
        # Deliberate best effort: keep the spans found before the tokenizer
        # gave up (e.g. on an unterminated construct) and treat the rest of
        # the text as code.
        pass

    result: list[str] = []
    pos = 0
    span_idx = 0
    n = len(source)

    while pos < n:
        # Inside (or at the start of) a protected span: copy it verbatim.
        if span_idx < len(protected) and pos >= protected[span_idx][0]:
            # max() keeps the cursor from moving backwards if a span ends
            # before the current position.
            end = max(pos, protected[span_idx][1])
            result.append(source[pos:end])
            pos = end
            span_idx += 1
            continue

        # A match may not run into the next protected span.
        code_end = protected[span_idx][0] if span_idx < len(protected) else n

        matched = False
        for k in keys:
            klen = len(k)
            if pos + klen > code_end or not source.startswith(k, pos):
                continue
            # Require identifier boundaries on both sides so a key is never
            # replaced inside a longer word.
            before_ok = pos == 0 or not _is_id_char(source[pos - 1])
            after_ok = (pos + klen >= n) or not _is_id_char(source[pos + klen])
            if before_ok and after_ok:
                result.append(unsafe_mapping[k])
                pos += klen
                matched = True
                break

        if not matched:
            result.append(source[pos])
            pos += 1

    return "".join(result)
|
||||
|
||||
|
||||
def _swap_tokens(source: str, mapping: dict) -> str:
|
||||
safe_mapping: dict[str, str] = {}
|
||||
unsafe_mapping: dict[str, str] = {}
|
||||
for k, v in mapping.items():
|
||||
(safe_mapping if _is_safe_token(k) else unsafe_mapping)[k] = v
|
||||
|
||||
if unsafe_mapping:
|
||||
source = _code_region_replace(source, unsafe_mapping)
|
||||
|
||||
source_lines = source.splitlines(keepends=True)
|
||||
tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
|
||||
|
||||
@@ -94,8 +192,8 @@ def _swap_tokens(source: str, mapping: dict) -> str:
|
||||
gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col)
|
||||
result.append(gap)
|
||||
|
||||
if tok_type == tokenize.NAME and tok_string in mapping:
|
||||
result.append(mapping[tok_string])
|
||||
if tok_type == tokenize.NAME and tok_string in safe_mapping:
|
||||
result.append(safe_mapping[tok_string])
|
||||
else:
|
||||
result.append(tok_string)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user