Merge pull request 'fix/tokenizer-bug' (#1) from fix/tokenizer-bug into main
All checks were successful
CI / test (push) Successful in 11s

Reviewed-on: #1
This commit was merged in pull request #1.
This commit is contained in:
2026-05-21 15:32:46 +00:00
2 changed files with 101 additions and 3 deletions

View File

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project] [project]
name = "foreignthon" name = "foreignthon"
version = "0.5.3" version = "0.5.4"
description = "Write Python in any language. Transpiles foreign-language .xx.py files to standard Python." description = "Write Python in any language. Transpiles foreign-language .xx.py files to standard Python."
license = { text = "GPL v3" } license = { text = "GPL v3" }
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -3,6 +3,8 @@ from __future__ import annotations
import io import io
import re import re
import tokenize import tokenize
import unicodedata
from functools import lru_cache
from pathlib import Path from pathlib import Path
from .pack import load_pack from .pack import load_pack
@@ -79,7 +81,103 @@ def _get_slice(source_lines: list[str], sr: int, sc: int, er: int, ec: int) -> s
return "".join(parts) return "".join(parts)
@lru_cache(maxsize=2048)
def _is_safe_token(s: str) -> bool:
"""Return True if the tokenizer produces `s` as a single NAME token.
Some Unicode scripts (e.g. Tamil) contain combining characters that the
tokenize module's regex treats as token boundaries, even though the full
string passes str.isidentifier(). Keys that fail this check need a
pre-pass string replacement before tokenization.
"""
try:
toks = [
t for t in tokenize.generate_tokens(io.StringIO(s + "\n").readline)
if t.type not in (
tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
)
]
return len(toks) == 1 and toks[0].type == tokenize.NAME and toks[0].string == s
except tokenize.TokenError:
return False
def _is_id_char(c: str) -> bool:
"""True if `c` can be part of an identifier (letter, mark, digit, or underscore)."""
cat = unicodedata.category(c)
return cat.startswith(("L", "M", "N")) or c == "_"
def _code_region_replace(source: str, unsafe_mapping: dict) -> str:
    """Replace unsafe keys in source only in code regions (not string literals or comments).

    The tokenizer correctly identifies STRING/COMMENT boundaries even when NAME
    tokens are mangled by combining characters, so we use it to find protected spans.
    Identifier-boundary checks prevent partial matches inside longer words.

    Args:
        source: Full source text to rewrite.
        unsafe_mapping: Maps each key that cannot be matched as a single NAME
            token to its replacement text. Empty keys are ignored.

    Returns:
        The source with every whole-word occurrence of a key outside the
        protected spans replaced. When several keys match at one position the
        longest one wins.
    """
    n = len(source)

    # Line start offsets must follow the tokenizer's own notion of a line. It
    # reads through StringIO.readline, which breaks on "\n" only, whereas
    # str.splitlines() also breaks on form feeds, U+2028 and similar; using it
    # would shift every offset computed after such a character.
    line_starts = [0]
    for line in io.StringIO(source).readlines():
        line_starts.append(line_starts[-1] + len(line))
    last_line = len(line_starts) - 1

    def to_offset(row: int, col: int) -> int:
        # Clamp so an unexpected token position can never index past the table.
        return min(line_starts[min(row - 1, last_line)] + col, n)

    protected_types = {tokenize.STRING, tokenize.COMMENT}
    # Newer Python versions tokenize f-strings (and template strings) as
    # START/MIDDLE/END pieces instead of one STRING; MIDDLE carries the literal
    # text, so protect it too when the running interpreter defines it. Code
    # inside the replacement fields stays replaceable.
    for type_name in ("FSTRING_MIDDLE", "TSTRING_MIDDLE"):
        tok_type = getattr(tokenize, type_name, None)
        if tok_type is not None:
            protected_types.add(tok_type)

    protected: list[tuple[int, int]] = []
    try:
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if tok.type in protected_types:
                protected.append((to_offset(*tok.start), to_offset(*tok.end)))
    except tokenize.TokenError:
        # Best effort: keep the spans collected before the tokenizer gave up
        # and treat the remainder of the source as code.
        pass

    # Longest keys first so a short key never pre-empts a longer one. Empty
    # keys are dropped: they would match everywhere without advancing `pos`.
    sorted_keys = sorted((k for k in unsafe_mapping if k), key=len, reverse=True)

    result: list[str] = []
    pos = 0
    span_idx = 0
    span_count = len(protected)
    while pos < n:
        if span_idx < span_count and pos >= protected[span_idx][0]:
            # Inside a protected span: copy it through verbatim. max() keeps
            # the cursor from ever moving backwards.
            end = max(pos, protected[span_idx][1])
            result.append(source[pos:end])
            pos = end
            span_idx += 1
            continue

        # A match must end before the next protected span begins.
        code_end = protected[span_idx][0] if span_idx < span_count else n
        for k in sorted_keys:
            klen = len(k)
            if pos + klen > code_end or not source.startswith(k, pos):
                continue
            before_ok = pos == 0 or not _is_id_char(source[pos - 1])
            after_ok = pos + klen >= n or not _is_id_char(source[pos + klen])
            if before_ok and after_ok:
                result.append(unsafe_mapping[k])
                pos += klen
                break
        else:
            # No key matched here; emit the character unchanged.
            result.append(source[pos])
            pos += 1
    return "".join(result)
def _swap_tokens(source: str, mapping: dict) -> str: def _swap_tokens(source: str, mapping: dict) -> str:
safe_mapping: dict[str, str] = {}
unsafe_mapping: dict[str, str] = {}
for k, v in mapping.items():
(safe_mapping if _is_safe_token(k) else unsafe_mapping)[k] = v
if unsafe_mapping:
source = _code_region_replace(source, unsafe_mapping)
source_lines = source.splitlines(keepends=True) source_lines = source.splitlines(keepends=True)
tokens = list(tokenize.generate_tokens(io.StringIO(source).readline)) tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
@@ -94,8 +192,8 @@ def _swap_tokens(source: str, mapping: dict) -> str:
gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col) gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col)
result.append(gap) result.append(gap)
if tok_type == tokenize.NAME and tok_string in mapping: if tok_type == tokenize.NAME and tok_string in safe_mapping:
result.append(mapping[tok_string]) result.append(safe_mapping[tok_string])
else: else:
result.append(tok_string) result.append(tok_string)