fix spacing: verbatim inter-token copy, fix postfix decompile, add integration tests

2026-05-16 18:27:56 -05:00
parent 8f7d926a17
commit 6cade887fb
3 changed files with 523 additions and 108 deletions
--- a/src/foreignthon/transpiler.py
+++ b/src/foreignthon/transpiler.py
@@ -37,10 +37,6 @@ def _apply_postfix_syntax(source: str, mapping: dict) -> str:


 def _apply_postfix_output(source: str, en_to_foreign: dict, postfix_english: set) -> str:
-    """
-    Post-pass for decompile: rewrite foreign keyword lines to @@ postfix.
-    postfix_english comes from the language pack's postfix_keywords list.
-    """
    postfix_foreign = {en_to_foreign[k] for k in postfix_english if k in en_to_foreign}

    lines = source.splitlines(keepends=True)
@@ -66,33 +62,46 @@ def _apply_postfix_output(source: str, en_to_foreign: dict, postfix_english: set
    return "".join(result)


-def transpile(source: str, lang_code: str) -> str:
-    pack = load_pack(lang_code)
+def _get_slice(source_lines: list[str], sr: int, sc: int, er: int, ec: int) -> str:
+    """Extract the text of source_lines between two (row, col) positions: rows are 1-indexed, columns 0-indexed, and the end position is exclusive."""
+    n = len(source_lines)
+    if sr > n:
+        return ""
+    if sr == er:
+        line = source_lines[sr - 1]
+        return line[sc:min(ec, len(line))]
+    parts = []
+    parts.append(source_lines[sr - 1][sc:])
+    for r in range(sr, er - 1):
+        if r < n:
+            parts.append(source_lines[r])
+    if er <= n:
+        parts.append(source_lines[er - 1][:ec])
+    return "".join(parts)

-    mapping: dict[str, str] = {}
-    mapping.update(pack["keywords"])
-    mapping.update(pack["builtins"])
-    mapping.update(pack["exceptions"])
-    mapping.update(pack["stdlib"])

-    source = _apply_postfix_syntax(source, mapping)
+def _swap_tokens(source: str, mapping: dict) -> str:
+    """
+    Replace each NAME token found in mapping with its mapped value, copying all inter-token text verbatim from source.
+    This preserves the original spacing exactly — no doubled newlines, no extra spaces.
+    """
+    source_lines = source.splitlines(keepends=True)
+    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))

-    tokens_in = tokenize.generate_tokens(io.StringIO(source).readline)
-    result: list[str] = []
+    result = []
    prev_end = (1, 0)

-    for tok in tokens_in:
-        tok_type, tok_string, tok_start, tok_end, _ = tok
+    for tok_type, tok_string, tok_start, tok_end, _ in tokens:
+        if tok_type in (tokenize.ENDMARKER, tokenize.ENCODING):
+            break

-        start_row, start_col = tok_start
-        end_row, end_col = prev_end
+        s_row, s_col = tok_start

-        if start_row == end_row:
-            result.append(" " * (start_col - end_col))
-        else:
-            result.append("\n" * (start_row - end_row))
-            result.append(" " * start_col)
+        # Copy original whitespace/newlines between tokens verbatim
+        gap = _get_slice(source_lines, prev_end[0], prev_end[1], s_row, s_col)
+        result.append(gap)

+        # Swap or keep token
        if tok_type == tokenize.NAME and tok_string in mapping:
            result.append(mapping[tok_string])
        else:
@@ -103,6 +112,19 @@ def transpile(source: str, lang_code: str) -> str:
    return "".join(result)


+def transpile(source: str, lang_code: str) -> str:
+    pack = load_pack(lang_code)
+
+    mapping: dict[str, str] = {}
+    mapping.update(pack["keywords"])
+    mapping.update(pack["builtins"])
+    mapping.update(pack["exceptions"])
+    mapping.update(pack["stdlib"])
+
+    source = _apply_postfix_syntax(source, mapping)
+    return _swap_tokens(source, mapping)
+
+
 def detranspile(source: str, lang_code: str, postfix: bool = False) -> str:
    pack = load_pack(lang_code)

@@ -111,33 +133,9 @@ def detranspile(source: str, lang_code: str, postfix: bool = False) -> str:
        for foreign, english in pack[section].items():
            en_to_foreign[english] = foreign

-    tokens_in = tokenize.generate_tokens(io.StringIO(source).readline)
-    result: list[str] = []
-    prev_end = (1, 0)
-
-    for tok in tokens_in:
-        tok_type, tok_string, tok_start, tok_end, _ = tok
-
-        start_row, start_col = tok_start
-        end_row, end_col = prev_end
-
-        if start_row == end_row:
-            result.append(" " * (start_col - end_col))
-        else:
-            result.append("\n" * (start_row - end_row))
-            result.append(" " * start_col)
-
-        if tok_type == tokenize.NAME and tok_string in en_to_foreign:
-            result.append(en_to_foreign[tok_string])
-        else:
-            result.append(tok_string)
-
-        prev_end = tok_end
-
-    output = "".join(result)
+    output = _swap_tokens(source, en_to_foreign)

    if postfix:
-        # Use pack-defined list, fallback to sensible defaults
        postfix_english = set(pack.get("postfix_keywords", ["if", "elif", "while"]))
        output = _apply_postfix_output(output, en_to_foreign, postfix_english)