#!/usr/bin/env python3
# Below is a more "Pythonic" rewrite of the original AWK-to-Python translator.
# The logic follows the AWK script - the same error messages, line numbers and
# exit codes - but the code is organized into small, reusable functions, uses
# `dataclasses`, type hints, `pathlib`, `re.sub()` and other idiomatic
# constructs. It is also easier to read and to extend.

"""
py_awk_translator.py

A line-by-line pre-processor that implements the same behaviour as the
original AWK script (handling @module, @alias, @long-alias,
private-variable expansion, @query/@reachable/@lemma checks and token-wise
alias substitution).

Usage
    python3 py_awk_translator.py file1.pv file2.pv
    # or
    cat file.pv | python3 py_awk_translator.py
"""

from __future__ import annotations

import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Iterable, NoReturn, Optional, Sequence

# ----------------------------------------------------------------------
# Helper utilities
# ----------------------------------------------------------------------

# A single character that may appear inside an identifier token.
TOKEN_RE = re.compile(r"[0-9A-Za-z_']")

# Presence of a @query/@reachable/@lemma directive anywhere in a line ...
_CHECK_RE = re.compile(r"@(query|reachable|lemma)")
# ... and the same directive followed by its mandatory quoted parameter.
_CHECK_WITH_ARG_RE = re.compile(r'@(query|reachable|lemma)\s+"[^"]*"')


def is_token_char(ch: str) -> bool:
    """Return True if *ch* is exactly one identifier-token character.

    The empty string and strings longer than one character yield False.
    """
    return bool(TOKEN_RE.fullmatch(ch))


def die(msg: str, fname: str, lineno: int) -> NoReturn:
    """Write ``fname:lineno: msg`` to stderr and exit with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    sys.stderr.write(f"{fname}:{lineno}: {msg}\n")
    sys.exit(1)


def _second_field(line: str) -> str:
    """Return the second whitespace-separated field of *line*, or ""."""
    fields = line.split()
    return fields[1] if len(fields) > 1 else ""


# ----------------------------------------------------------------------
# Core translator - holds the mutable state that the AWK script kept in
# global variables.
# ----------------------------------------------------------------------
@dataclass
class Translator:
    """Collect state and output while processing input line by line."""

    # final output buffer, one entry per input line (each ends with "\n")
    out: list[str] = field(default_factory=list)
    # current @module name (used when expanding "~")
    module: str = ""
    # aliases: name -> replacement text
    aliases: Dict[str, str] = field(default_factory=dict)
    # multi-line alias handling: name being defined and text gathered so far
    long_name: str = ""
    long_value: str = ""
    # error flag - mirrors the AWK variable `err`
    err: int = 0

    # ------------------------------------------------------------------
    # Public entry point for a single line
    # ------------------------------------------------------------------
    def process(self, raw: str, fname: str, lineno: int) -> None:
        """Apply all transformation rules to *raw* and store the result.

        Exactly one entry is appended to ``self.out`` per call, so output
        line numbers stay aligned with input line numbers.

        Args:
            raw: One input line, with or without its trailing newline.
            fname: File name used as the prefix of error messages.
            lineno: 1-based line number used in error messages.

        Raises:
            SystemExit: via ``die()`` when ``@long-alias-end`` has no open
                long alias, when the line contains ``PRIVATE__``, or when a
                ``@query``/``@reachable``/``@lemma`` lacks its quoted
                parameter.
        """
        line = raw.rstrip("\n")
        original = line  # untouched copy for error messages

        # Directives are mutually exclusive; the first matching rule wins.
        # "@long-alias-end" must be tested before its prefix "@long-alias".
        if line.startswith("@module"):
            # Only the second field is the name, so trailing blanks or extra
            # words cannot leak into the generated PRIVATE__ prefix.
            self.module = _second_field(line)
            self.aliases.clear()
            line = ""
        elif line.startswith("@alias"):
            for token in line.split()[1:]:
                if "=" in token:
                    name, value = token.split("=", 1)
                    self.aliases[name] = value
            line = ""
        elif line.startswith("@long-alias-end"):
            if not self.long_name:
                die("Long alias not started", fname, lineno)
            # collapse runs of spaces, then trim surrounding whitespace
            collapsed = re.sub(r" +", " ", self.long_value).strip()
            self.aliases[self.long_name] = collapsed
            self.long_name = self.long_value = ""
            line = ""
        elif line.startswith("@long-alias"):
            # Second field only: a name with trailing whitespace could never
            # match as a token in _expand_aliases().
            self.long_name = _second_field(line)
            self.long_value = ""
            line = ""
        elif "PRIVATE__" in line:
            # The private prefix may only be produced by "~" expansion below.
            die(
                "Used private variable without ~:\n\n"
                f" {lineno} > {original}",
                fname,
                lineno,
            )
        elif _CHECK_RE.search(line):
            line = self._blank_check_header(line, fname, lineno)

        # Expand "~" to the private-variable prefix of the current module.
        line = line.replace("~", f"PRIVATE__{self.module}__")

        # Token-wise alias substitution.
        line = self._expand_aliases(line)

        # Inside a long alias the text is collected instead of emitted.
        if self.long_name:
            self.long_value += line + " "
            line = ""

        self.out.append(line + "\n")

    # ------------------------------------------------------------------
    # Helper for @query / @reachable / @lemma lines
    # ------------------------------------------------------------------
    @staticmethod
    def _blank_check_header(line: str, fname: str, lineno: int) -> str:
        """Blank out the first ``@directive "param"`` span of *line*.

        The span is replaced by the same number of spaces so the rest of the
        line keeps its column positions.

        Raises:
            SystemExit: via ``die()`` when no directive in the line is
                followed by a double-quoted parameter.
        """
        match = _CHECK_WITH_ARG_RE.search(line)
        if match is None:
            die(
                "@query or @reachable statement without parameter:\n\n"
                f" {lineno} > {line}",
                fname,
                lineno,
            )
        start, end = match.span()
        return line[:start] + " " * (end - start) + line[end:]

    # ------------------------------------------------------------------
    # Helpers that implement the token-wise alias replacement
    # ------------------------------------------------------------------
    def _alias_at(self, text: str, pos: int) -> Optional[str]:
        """Return the first alias name that is a whole token at *pos*.

        Aliases are tried in insertion order. A name qualifies when *text*
        continues with it at *pos* and the character right after it is not a
        token character (or the text ends there).
        """
        for name in self.aliases:
            if text.startswith(name, pos):
                end = pos + len(name)
                if not is_token_char(text[end:end + 1]):
                    return name
        return None

    def _expand_aliases(self, text: str) -> str:
        """Replace every whole-token alias in *text* with its value.

        Matching is attempted only where a token starts (a token character
        not preceded by another token character). Replacement text is not
        rescanned, so aliases do not expand recursively.
        """
        if not self.aliases:
            return text

        pieces: list[str] = []
        copied = 0  # text[:copied] has already been emitted into pieces
        i = 0
        size = len(text)
        while i < size:
            starts_token = is_token_char(text[i]) and (
                i == 0 or not is_token_char(text[i - 1])
            )
            name = self._alias_at(text, i) if starts_token else None
            if name is None:
                i += 1
                continue
            pieces.append(text[copied:i])
            pieces.append(self.aliases[name])
            # _alias_at() only returns names of length >= 1 here, because
            # text[i] is a token character, so the scan always advances.
            i += len(name)
            copied = i
        pieces.append(text[copied:])
        return "".join(pieces)

    # ------------------------------------------------------------------
    # Finalisation
    # ------------------------------------------------------------------
    def finish(self) -> None:
        """Write the accumulated output to stdout (unless ``err`` is set)."""
        if self.err == 0:
            sys.stdout.write("".join(self.out))


# ----------------------------------------------------------------------
# Command-line driver
# ----------------------------------------------------------------------
def _process_path(path: Path, translator: Translator) -> None:
    """Read *path* line by line and feed each line to *translator*.

    The open file is iterated directly so that lines end only at newline
    characters, exactly as for stdin. ``str.splitlines()`` would also split
    at form feeds, vertical tabs, U+2028 and similar characters.
    """
    with path.open(encoding="utf-8") as handle:
        for lineno, raw in enumerate(handle, start=1):
            translator.process(raw, str(path), lineno)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Translate the given files (or stdin) and print the result.

    Args:
        argv: File names to process. ``None`` (the default) means
            ``sys.argv[1:]``. With no file names, stdin is read and reported
            under the empty file name "".

    Raises:
        SystemExit: with status 1 when a named file does not exist or a
            translation rule reports an error.
    """
    names = sys.argv[1:] if argv is None else list(argv)
    translator = Translator()

    if not names:
        # stdin is treated as a single "virtual" file so that line numbers
        # are still correct.
        for lineno, raw in enumerate(sys.stdin, start=1):
            translator.process(raw, "", lineno)
    else:
        for name in names:
            path = Path(name)
            if not path.is_file():
                sys.stderr.write(f"File not found: {name}\n")
                sys.exit(1)
            _process_path(path, translator)

    translator.finish()


if __name__ == "__main__":
    main()

## What makes this version more Pythonic?
# | Aspect | Original style | Refactored style |
# |--------|----------------|------------------|
# | **State handling** | Global variables (`buf`, `module`, ...) | `@dataclass Translator` encapsulates all mutable state |
# | **Regular-expression reuse** | Re-compiled on every call (`match`, `gsub`) | Compiled once (`TOKEN_RE`) and reused |
# | **String manipulation** | Manual `substr`, concatenation in loops | Slicing, `str.replace`, `re.sub` for clarity |
# | **Loop logic** | `for (i=1; i<...)` counter loop over characters | Index-based `while` scan in `_expand_aliases()` |
# | **Error handling** | `print(...) > "/dev/stderr"` and `exit(1)` | Dedicated `die()` helper that writes to `stderr` and exits |
# | **File I/O** | Manual `while (getline ...)` in AWK | `for` over `sys.stdin` or the open file, with `enumerate` for line numbers |
# | **Readability** | Mixed AWK-style comments, one-liner `if` statements | Docstrings, section comments, type hints, and small helper methods |
# | **Extensibility** | Adding a new rule required editing a monolithic block | New rules can be added as separate `elif` blocks or new methods without touching the core loop |
# The script can be saved as `py_awk_translator.py`, made executable
# (`chmod +x py_awk_translator.py`), and used like the original AWK program
# while being easier to maintain and understand.