diff --git a/pyproject.toml b/pyproject.toml
index 73afa8d..4d287de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "foreignthon"
-version = "0.5.3"
+version = "0.5.4"
 description = "Write Python in any language. Transpiles foreign-language .xx.py files to standard Python."
 license = { text = "GPL v3" }
 requires-python = ">=3.9"
diff --git a/src/foreignthon/transpiler.py b/src/foreignthon/transpiler.py
index 1cce7de..11c6db1 100644
--- a/src/foreignthon/transpiler.py
+++ b/src/foreignthon/transpiler.py
@@ -3,6 +3,8 @@ from __future__ import annotations
 import io
 import re
 import tokenize
+import unicodedata
+from functools import lru_cache
 from pathlib import Path
 
 from .pack import load_pack
@@ -79,7 +81,116 @@ def _get_slice(source_lines: list[str], sr: int, sc: int, er: int, ec: int) -> s
     return "".join(parts)
 
 
+@lru_cache(maxsize=2048)
+def _is_safe_token(s: str) -> bool:
+    """Return True if the tokenizer produces `s` as a single NAME token.
+
+    Some Unicode scripts (e.g. Tamil) contain combining characters that the
+    tokenize module's regex treats as token boundaries, even though the full
+    string passes str.isidentifier(). Keys that fail this check need a
+    pre-pass string replacement before tokenization.
+    """
+    try:
+        toks = [
+            t for t in tokenize.generate_tokens(io.StringIO(s + "\n").readline)
+            if t.type not in (
+                tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
+                tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
+            )
+        ]
+        return len(toks) == 1 and toks[0].type == tokenize.NAME and toks[0].string == s
+    except (tokenize.TokenError, SyntaxError):
+        # The tokenizer can also raise IndentationError (a SyntaxError); such a key is unsafe.
+        return False
+
+
+def _is_id_char(c: str) -> bool:
+    """True if `c` can continue an identifier (letter, mark, number, or a connector such as "_")."""
+    cat = unicodedata.category(c)
+    return cat[0] in "LMN" or cat == "Pc"
+
+
+def _code_region_replace(source: str, unsafe_mapping: dict) -> str:
+    """Replace unsafe keys in source only in code regions (not string literals or comments).
+
+    The tokenizer correctly identifies STRING/COMMENT boundaries even when NAME
+    tokens are mangled by combining characters, so we use it to find protected spans.
+    Identifier-boundary checks prevent partial matches inside longer words.
+    Longer keys are tried first; empty keys are ignored because matching one
+    would never advance the scan position.
+    """
+    lines = source.splitlines(keepends=True)
+    cumlen = [0]
+    for line in lines:
+        cumlen.append(cumlen[-1] + len(line))
+
+    # Python 3.12+ splits f-strings into FSTRING_* tokens (3.14 adds TSTRING_*);
+    # their literal text must be protected just like a plain STRING token.
+    protected_types = {tokenize.STRING, tokenize.COMMENT}
+    for name in ("FSTRING_MIDDLE", "TSTRING_MIDDLE"):
+        if hasattr(tokenize, name):
+            protected_types.add(getattr(tokenize, name))
+
+    protected: list[tuple[int, int]] = []
+    try:
+        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
+            if tok.type in protected_types:
+                sr, sc = tok.start
+                er, ec = tok.end
+                protected.append((cumlen[sr - 1] + sc, cumlen[er - 1] + ec))
+    except tokenize.TokenError:
+        # Best effort: keep the spans collected before the tokenizer gave up.
+        pass
+
+    # Longest key first so a short key cannot pre-empt a longer one; skip ""
+    # because a zero-length match would leave `pos` unchanged and loop forever.
+    sorted_keys = sorted((k for k in unsafe_mapping if k), key=len, reverse=True)
+    result: list[str] = []
+    pos = 0
+    span_idx = 0
+    n = len(source)
+
+    while pos < n:
+        if span_idx < len(protected) and pos >= protected[span_idx][0]:
+            end = protected[span_idx][1]
+            result.append(source[pos:end])
+            pos = end
+            span_idx += 1
+            continue
+
+        code_end = protected[span_idx][0] if span_idx < len(protected) else n
+
+        matched = False
+        for k in sorted_keys:
+            klen = len(k)
+            if pos + klen > code_end:
+                continue
+            if source[pos:pos + klen] != k:
+                continue
+            before_ok = pos == 0 or not _is_id_char(source[pos - 1])
+            after_ok = (pos + klen >= n) or not _is_id_char(source[pos + klen])
+            if before_ok and after_ok:
+                result.append(unsafe_mapping[k])
+                pos += klen
+                matched = True
+                break
+
+        if not matched:
+            result.append(source[pos])
+            pos += 1
+
+    return "".join(result)
+
+
 def _swap_tokens(source: str, mapping: dict) -> str:
+    safe_mapping: dict[str, str] = {}
+    unsafe_mapping: dict[str, str] = {}
+    for k, v in mapping.items():
+        (safe_mapping if _is_safe_token(k) else unsafe_mapping)[k] = v
+
+    if unsafe_mapping:
+        source = _code_region_replace(source, unsafe_mapping)
+
     source_lines = source.splitlines(keepends=True)
     tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
 
@@ -94,8 +205,8 @@ def _swap_tokens(source: str, mapping: dict) -> str:
             gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col)
             result.append(gap)
 
-        if tok_type == tokenize.NAME and tok_string in mapping:
-            result.append(mapping[tok_string])
+        if tok_type == tokenize.NAME and tok_string in safe_mapping:
+            result.append(safe_mapping[tok_string])
         else:
             result.append(tok_string)
 