Merge pull request 'fix/tokenizer-bug' (#1) from fix/tokenizer-bug into main

Reviewed-on: #1
2026-05-21 15:32:46 +00:00
parent e50b2754ae d4b931ec2c
commit 21107a1d43
2 changed files with 101 additions and 3 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "foreignthon"
-version = "0.5.3"
+version = "0.5.4"
 description = "Write Python in any language. Transpiles foreign-language .xx.py files to standard Python."
 license = { text = "GPL v3" }
 requires-python = ">=3.9"
--- a/src/foreignthon/transpiler.py
+++ b/src/foreignthon/transpiler.py
@@ -3,6 +3,8 @@ from __future__ import annotations
 import io
 import re
 import tokenize
 import unicodedata
 from functools import lru_cache
 from pathlib import Path
 from .pack import load_pack
@@ -79,7 +81,103 @@ def _get_slice(source_lines: list[str], sr: int, sc: int, er: int, ec: int) -> s
    return "".join(parts)
@lru_cache(maxsize=2048)
 def _is_safe_token(s: str) -> bool:
    """Return True if the tokenizer produces `s` as a single NAME token.
    Some Unicode scripts (e.g. Tamil) contain combining characters that the
    tokenize module's regex treats as token boundaries, even though the full
    string passes str.isidentifier().  Keys that fail this check need a
    pre-pass string replacement before tokenization.
    """
    try:
        toks = [
            t for t in tokenize.generate_tokens(io.StringIO(s + "\n").readline)
            if t.type not in (
                tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
                tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
            )
        ]
        return len(toks) == 1 and toks[0].type == tokenize.NAME and toks[0].string == s
    except tokenize.TokenError:
        return False
 def _is_id_char(c: str) -> bool:
    """True if `c` can be part of an identifier (letter, mark, digit, or underscore)."""
    cat = unicodedata.category(c)
    return cat.startswith(("L", "M", "N")) or c == "_"
 def _code_region_replace(source: str, unsafe_mapping: dict) -> str:
    """Replace unsafe keys in source only in code regions (not string literals or comments).
    The tokenizer correctly identifies STRING/COMMENT boundaries even when NAME
    tokens are mangled by combining characters, so we use it to find protected spans.
    Identifier-boundary checks prevent partial matches inside longer words.
    """
    lines = source.splitlines(keepends=True)
    cumlen = [0]
    for line in lines:
        cumlen.append(cumlen[-1] + len(line))
    protected: list[tuple[int, int]] = []
    try:
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if tok.type in (tokenize.STRING, tokenize.COMMENT):
                sr, sc = tok.start
                er, ec = tok.end
                protected.append((cumlen[sr - 1] + sc, cumlen[er - 1] + ec))
    except tokenize.TokenError:
        pass
    sorted_keys = sorted(unsafe_mapping, key=len, reverse=True)
    result: list[str] = []
    pos = 0
    span_idx = 0
    n = len(source)
    while pos < n:
        if span_idx < len(protected) and pos >= protected[span_idx][0]:
            end = protected[span_idx][1]
            result.append(source[pos:end])
            pos = end
            span_idx += 1
            continue
        code_end = protected[span_idx][0] if span_idx < len(protected) else n
        matched = False
        for k in sorted_keys:
            klen = len(k)
            if pos + klen > code_end:
                continue
            if source[pos:pos + klen] != k:
                continue
            before_ok = pos == 0 or not _is_id_char(source[pos - 1])
            after_ok = (pos + klen >= n) or not _is_id_char(source[pos + klen])
            if before_ok and after_ok:
                result.append(unsafe_mapping[k])
                pos += klen
                matched = True
                break
        if not matched:
            result.append(source[pos])
            pos += 1
    return "".join(result)
 def _swap_tokens(source: str, mapping: dict) -> str:
    safe_mapping: dict[str, str] = {}
    unsafe_mapping: dict[str, str] = {}
    for k, v in mapping.items():
        (safe_mapping if _is_safe_token(k) else unsafe_mapping)[k] = v
    if unsafe_mapping:
        source = _code_region_replace(source, unsafe_mapping)
    source_lines = source.splitlines(keepends=True)
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
@@ -94,8 +192,8 @@ def _swap_tokens(source: str, mapping: dict) -> str:
        gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col)
        result.append(gap)
-        if tok_type == tokenize.NAME and tok_string in mapping:
+        if tok_type == tokenize.NAME and tok_string in safe_mapping:
-            result.append(mapping[tok_string])
+            result.append(safe_mapping[tok_string])
        else:
            result.append(tok_string)