Merge pull request 'fix/tokenizer-bug' (#1 ) from fix/tokenizer-bug into main

Reviewed-on: #1
bumped version
2026-05-21 15:32:46 +00:00 · 2026-05-21 15:31:29 +00:00 · 2026-05-21 15:30:10 +00:00 · 2026-05-20 17:29:55 -05:00
3 changed files with 157 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -1 +1,56 @@
-# foreignthon
+# ForeignThon
+
+Write Python in any human language.
+
+ForeignThon transpiles source files written in a human language (`.es.py`, `.ta.py`, and more) into standard Python — keywords, builtins, and exceptions are all translated. Error messages come back in your language too.
+
+```python
+# main.es.py
+def saludar(nombre):
+    retornar f"Hola, {nombre}!"
+
+para i en dist(3):
+    escribir(saludar(f"mundo {i}"))
+```
+
+```bash
+fpy run main.es.py
+# Hola, mundo 0!
+# Hola, mundo 1!
+# Hola, mundo 2!
+```
+
+## Install
+
+```bash
+pip install foreignthon      # Core
+pip install foreignthon-es   # Spanish
+pip install foreignthon-ta   # Tamil
+```
+
+## Quick start
+
+```bash
+fpy new myproject --lang es
+cd myproject
+fpy run src/main.es.py
+```
+
+## Commands
+
+| Command | Description |
+|---|---|
+| `fpy new <name> --lang <code>` | Scaffold a new project |
+| `fpy run <file>` | Transpile and run |
+| `fpy compile <file>` | Transpile to `.compiled.py` |
+| `fpy decompile <file> --lang <code>` | Convert standard Python back into the given language |
+| `fpy check <file>` | Validate without running |
+
+## Documentation
+
+→ [foreignthon.keshavanand.net](https://foreignthon.keshavanand.net)
+
+## License
+
+GPL v3
+
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "foreignthon"
-version = "0.5.3"
+version = "0.5.4"
 description = "Write Python in any language. Transpiles foreign-language .xx.py files to standard Python."
 license = { text = "GPL v3" }
 requires-python = ">=3.9"
--- a/src/foreignthon/transpiler.py
+++ b/src/foreignthon/transpiler.py
@@ -3,6 +3,8 @@ from __future__ import annotations
 import io
 import re
 import tokenize
+import unicodedata
+from functools import lru_cache
 from pathlib import Path

 from .pack import load_pack
@@ -79,7 +81,103 @@ def _get_slice(source_lines: list[str], sr: int, sc: int, er: int, ec: int) -> s
    return "".join(parts)


+@lru_cache(maxsize=2048)
+def _is_safe_token(s: str) -> bool:
+    """Return True if the tokenizer produces `s` as a single NAME token.
+
+    Some Unicode scripts (e.g. Tamil) contain combining characters that the
+    tokenize module's regex treats as token boundaries, even though the full
+    string passes str.isidentifier().  Keys that fail this check need a
+    pre-pass string replacement before tokenization.
+    """
+    try:
+        toks = [
+            t for t in tokenize.generate_tokens(io.StringIO(s + "\n").readline)
+            if t.type not in (
+                tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
+                tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
+            )
+        ]
+        return len(toks) == 1 and toks[0].type == tokenize.NAME and toks[0].string == s
+    except tokenize.TokenError:
+        return False
+
+
+def _is_id_char(c: str) -> bool:
+    """True if `c` can be part of an identifier (letter, mark, digit, or underscore)."""
+    cat = unicodedata.category(c)
+    return cat.startswith(("L", "M", "N")) or c == "_"
+
+
+def _code_region_replace(source: str, unsafe_mapping: dict) -> str:
+    """Replace unsafe keys in source only in code regions (not string literals or comments).
+
+    The tokenizer correctly identifies STRING/COMMENT boundaries even when NAME
+    tokens are mangled by combining characters, so we use it to find protected spans.
+    Identifier-boundary checks prevent partial matches inside longer words.
+    """
+    lines = source.splitlines(keepends=True)
+    cumlen = [0]
+    for line in lines:
+        cumlen.append(cumlen[-1] + len(line))
+
+    protected: list[tuple[int, int]] = []
+    try:
+        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
+            if tok.type in (tokenize.STRING, tokenize.COMMENT):
+                sr, sc = tok.start
+                er, ec = tok.end
+                protected.append((cumlen[sr - 1] + sc, cumlen[er - 1] + ec))
+    except tokenize.TokenError:
+        pass
+
+    sorted_keys = sorted(unsafe_mapping, key=len, reverse=True)
+    result: list[str] = []
+    pos = 0
+    span_idx = 0
+    n = len(source)
+
+    while pos < n:
+        if span_idx < len(protected) and pos >= protected[span_idx][0]:
+            end = protected[span_idx][1]
+            result.append(source[pos:end])
+            pos = end
+            span_idx += 1
+            continue
+
+        code_end = protected[span_idx][0] if span_idx < len(protected) else n
+
+        matched = False
+        for k in sorted_keys:
+            klen = len(k)
+            if pos + klen > code_end:
+                continue
+            if source[pos:pos + klen] != k:
+                continue
+            before_ok = pos == 0 or not _is_id_char(source[pos - 1])
+            after_ok = (pos + klen >= n) or not _is_id_char(source[pos + klen])
+            if before_ok and after_ok:
+                result.append(unsafe_mapping[k])
+                pos += klen
+                matched = True
+                break
+
+        if not matched:
+            result.append(source[pos])
+            pos += 1
+
+    return "".join(result)
+
+
 def _swap_tokens(source: str, mapping: dict) -> str:
+    safe_mapping: dict[str, str] = {}
+    unsafe_mapping: dict[str, str] = {}
+    for k, v in mapping.items():
+        (safe_mapping if _is_safe_token(k) else unsafe_mapping)[k] = v
+
+    if unsafe_mapping:
+        source = _code_region_replace(source, unsafe_mapping)
+
    source_lines = source.splitlines(keepends=True)
    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))

@@ -94,8 +192,8 @@ def _swap_tokens(source: str, mapping: dict) -> str:
        gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col)
        result.append(gap)

-        if tok_type == tokenize.NAME and tok_string in mapping:
-            result.append(mapping[tok_string])
+        if tok_type == tokenize.NAME and tok_string in safe_mapping:
+            result.append(safe_mapping[tok_string])
        else:
            result.append(tok_string)
Author	SHA1	Message	Date
Keshav Anand	21107a1d43	Merge pull request 'fix/tokenizer-bug' (#1 ) from fix/tokenizer-bug into main All checks were successful CI / test (push) Successful in 11s Details Reviewed-on: #1	2026-05-21 15:32:46 +00:00
KeshavAnandCode	d4b931ec2c	bumped version All checks were successful CI / test (pull_request) Successful in 12s Details Publish Core / verify (push) Successful in 10s Details Publish Core / publish (push) Successful in 14s Details	2026-05-21 15:31:29 +00:00
KeshavAnandCode	784b3729e4	replit agent fixes for languages like tamil	2026-05-21 15:30:10 +00:00
KeshavAnandCode	e50b2754ae	added readme All checks were successful CI / test (push) Successful in 12s Details	2026-05-20 17:29:55 -05:00