WIP: lark parser for awk

Co-authored-by: Benjamin Lipp <blipp@rosenpass.eu>
2026-02-28 14:33:37 -08:00 · 2025-09-30 17:46:33 +02:00
parent cb16bd44bb
commit a389e3c222
3 changed files with 391 additions and 1 deletions
--- a/marzipan/README.md
+++ b/marzipan/README.md
@@ -4,5 +4,5 @@
 * `src/__init__.py` is the new script

 * call the old script from the Rosenpass repository's root directory with `./analyze.sh`
-* call the new script:
+* call the new script from the marzipan directory:
  * `nix run .# -- analyze $repo` where `$repo` is the absolute(?) path to the root directory of the Rosenpass repository.
--- a/marzipan/TODO.md
+++ b/marzipan/TODO.md
@@ -37,6 +37,7 @@
 ## Next Steps

 * integrate marzipan.awk into Python, somehow
+  * lark parser: multiline comments, how???
 * rewrite marzipan.awk into Python/LARK
  * define a LARK grammar for marzipan.awk rules
  * write python code for processing marzipan rules, e.g. alias replacement (step: i.pv->o.pv)
--- a/marzipan/parser.py
+++ b/marzipan/parser.py
@@ -0,0 +1,389 @@
+from lark import Lark, tree, exceptions, Token, Transformer
+import sys
+
+proverif_grammar = Lark(
+    grammar="""
+    PROCESS: "process"
+    start: decl* PROCESS process
+    YIELD: "yield"
+    channel: CHANNEL
+    CHANNEL: "channel"
+    """
+    + (
+        "IDENT: /(?!(among|axiom|channel|choice|clauses|const|def|diff|"
+        "do|elimtrue|else|equation|equivalence|event|expand|fail|for|forall|"
+        "foreach|free|fun|get|if|implementation|in|inj-event|insert|lemma|let|"
+        "letfun|letproba|new|noninterf|noselect|not|nounif|or|otherwise|out|"
+        "param|phase|pred|proba|process|proof|public vars|putbegin|query|reduc|"
+        "restriction|secret|select|set|suchthat|sync|table|then|type|weaksecret|"
+        "yield))[a-zA-Z][a-zA-Z0-9À-ÿ'_]*/"
+    )
+    + """
+    ZERO: "0"
+    INFIX: "||"
+         | "&&"
+         | "="
+         | "<>"
+         | "<="
+         | ">="
+         | "<"
+         | ">"
+    typeid: channel
+          | IDENT
+    _non_empty_seq{x}: x ("," x)*
+    _maybe_empty_seq{x}: [ _non_empty_seq{x} ]
+
+    options: [ "[" _non_empty_seq{IDENT} "]" ]
+    process: ZERO
+           | YIELD
+           | IDENT [ "(" _maybe_empty_seq{pterm} ")" ]
+           | bracketed_process
+           | piped_process
+           | replicated_process
+           | replicated_process_bounds
+           | sample_process
+           | if_process
+           | in_process
+           | out_process
+           | let_process
+           | insert_process
+           | get_process
+           | event_process
+           | phase
+           | sync
+    bracketed_process: "(" process ")"
+    piped_process: process "|" process
+    replicated_process: "!" process
+    replicated_process_bounds: "!" IDENT "<=" IDENT process
+                             | "foreach" IDENT "<=" IDENT "do" process
+    sample_process: "new" IDENT [ "[" _maybe_empty_seq{IDENT} "]" ] ":" typeid [";" process]
+                  | IDENT "<-R" typeid [";" process]
+    let_process: "let" pattern "=" pterm ["in" pterm [ "else" pterm ]]
+               | IDENT [":" typeid] "<-" pterm [";" process]
+               | "let" typedecl "suchthat" pterm options [ "in" process [ "else" process ] ]
+    if_process: "if" pterm "then" process [ "else" process ]
+    in_process: "(" pterm "," pattern ")" options [ ";" process ]
+    get_process: IDENT "(" _maybe_empty_seq{pattern} ")" [ "suchthat" pterm ] options [ "in" process [ "else" process ] ]
+    out_process: "out" "(" pterm "," pterm ")" [ ";" process ]
+    insert_process: "insert" IDENT "(" _maybe_empty_seq{pterm} ")" [ ";" process ]
+    event_process: "event" IDENT [ "(" _maybe_empty_seq{pterm} ")" ] [ ";" process ]
+    term: IDENT
+          | NAT
+          | "(" _maybe_empty_seq{term} ")"
+          | IDENT "(" _maybe_empty_seq{term} ")"
+          | term ( "+" | "-" ) NAT
+          | NAT "+" term
+          | term INFIX term
+          | "not" "(" term ")"
+
+    query: gterm ["public_vars" _non_empty_seq{IDENT}] [";" query]
+         | "secret" IDENT ["public_vars" _non_empty_seq{IDENT}] options [";" query]
+         | "putbegin" "event" ":" _non_empty_seq{IDENT} [";" query] // Opportunistically left a space between "event" and ":", ProVerif might not accept it with spaces.
+         | "putbegin" "inj-event" ":" _non_empty_seq{IDENT} [";" query]
+    lemma: gterm [";" lemma]
+         | gterm "for" "{" "public_vars" _non_empty_seq{IDENT} "}" [";" lemma]
+         | gterm "for" "{" "secret" IDENT [ "public_vars" _non_empty_seq{IDENT}] "[real_or_random]" "}" [";" lemma]
+    gterm: ident_gterm
+         | fun_gterm
+         | choice_gterm
+         | infix_gterm
+         | arith_gterm
+         | arith2_gterm
+         | event_gterm
+         | injevent_gterm
+         | implies_gterm
+         | paren_gterm
+         | sample_gterm
+         | let_gterm
+    ident_gterm: IDENT
+    fun_gterm: IDENT "(" _maybe_empty_seq{gterm} ")" ["phase" NAT] ["@" IDENT]
+    choice_gterm: "choice" "[" gterm "," gterm "]"
+    infix_gterm: gterm INFIX gterm
+    arith_gterm: gterm ( "+" | "-" ) NAT
+    arith2_gterm: NAT "+" gterm
+    event_gterm: "event" "(" _maybe_empty_seq{gterm} ")" ["@" IDENT]
+    injevent_gterm: "inj-event" "(" _maybe_empty_seq{gterm} ")" ["@" IDENT]
+    implies_gterm: gterm "==>" gterm
+    paren_gterm: "(" _maybe_empty_seq{gterm} ")"
+    sample_gterm: "new" IDENT [ "[" [ gbinding ] "]" ]
+    let_gterm: "let" IDENT "=" gterm "in" gterm
+
+    gbinding: "!" NAT "=" gterm [";" gbinding]
+            | IDENT "=" gterm [";" gbinding]
+
+    nounifdecl: "let" IDENT "=" gformat "in" nounifdecl
+              | IDENT ["(" _maybe_empty_seq{gformat} ")" ["phase" NAT]]
+    gformat: IDENT
+           | "*" IDENT
+           | IDENT "(" _maybe_empty_seq{gformat} ")"
+           | "choice" "[" gformat "," gformat "]"
+           | "not" "(" _maybe_empty_seq{gformat} ")"
+           | "new" IDENT [ "[" [ fbinding ] "]" ]
+           | "let" IDENT "=" gformat "in" gformat
+    fbinding: "!" NAT "=" gformat [";" fbinding]
+            | IDENT "=" gformat [";" fbinding]
+    nounifoption: "hypothesis"
+                | "conclusion"
+                | "ignoreAFewTimes"
+                | "inductionOn" "=" IDENT
+                | "inductionOn" "=" "{" _non_empty_seq{IDENT} "}"
+
+    pterm: IDENT
+          | "(" _maybe_empty_seq{pterm} ")"
+          | IDENT "(" _maybe_empty_seq{pterm} ")"
+          | choice_pterm
+          | pterm ("+" | "-") NAT
+          | NAT "+" pterm
+          | pterm INFIX pterm
+          | not_pterm
+          | sample_pterm
+          | if_pterm
+          | let_pterm
+          | insert_pterm
+          | get_pterm
+          | event_pterm
+    choice_pterm: "choice[" pterm "," pterm "]"
+    if_pterm: "if" pterm "then" pterm [ "else" pterm ]
+    not_pterm: "not" "(" pterm ")"
+    let_pterm: "let" pattern "=" pterm "in" pterm [ "else" pterm ]
+             | IDENT [":" typeid] "<-" pterm ";" pterm
+             | "let" typedecl "suchthat" pterm "in" pterm [ "else" pterm ]
+    sample_pterm: "new" IDENT [ "[" _maybe_empty_seq{IDENT} "]" ] ":" typeid [";" pterm]
+                | IDENT "<-R" typeid [";" pterm]
+    insert_pterm: "insert" IDENT "(" _maybe_empty_seq{pterm} ")" ";" pterm
+    event_pterm: "event" IDENT [ "(" _maybe_empty_seq{pterm} ")" ] ";" pterm
+    get_pterm: IDENT "(" _maybe_empty_seq{pattern} ")" [ "suchthat" pterm ] options [ "in" pterm [ "else" pterm ] ]
+    pattern: IDENT [":" typeid]
+           | "_" [ ":" typeid ]
+           | NAT
+           | pattern "+" NAT
+           | NAT "+" pattern
+           | "(" _maybe_empty_seq{pattern} ")"
+           | IDENT "(" _maybe_empty_seq{pattern} ")"
+           | "=" pterm
+    mayfailterm: term
+               | "fail"
+    mayfailterm_seq: "(" _non_empty_seq{mayfailterm} ")"
+    typedecl: _non_empty_seq{IDENT} ":" typeid [ "," typedecl ]
+    failtypedecl: _non_empty_seq{IDENT} ":" typeid [ "or fail" ] [ "," failtypedecl ]
+
+    decl: type_decl
+        | channel_decl
+        | free_decl
+        | const_decl
+        | fun_decl
+        | letfun_decl
+        | reduc_decl
+        | fun_reduc_decl
+        | equation_decl
+        | pred_decl
+        | table_decl
+        | let_decl
+        | set_settings_decl
+        | event_decl
+        | query_decl
+        | axiom_decl
+        | restriction_decl
+        | lemma_decl
+        | noninterf_decl
+        | weaksecret_decl
+        | not_decl
+        | select_decl
+        | noselect_decl
+        | nounif_decl
+        | elimtrue_decl
+        | clauses_decl
+        #| param_decl
+        #| proba_decl
+        #| letproba_decl
+        #| proof_decl
+        #| def_decl
+        #| expand_decl
+
+    type_decl: "type" IDENT options "."
+    channel_decl: "channel" _non_empty_seq{IDENT} "."
+    free_decl: "free" _non_empty_seq{IDENT} ":" typeid options "."
+    const_decl: "const" _non_empty_seq{IDENT} ":" typeid options "."
+    fun_decl: "fun" IDENT "(" _maybe_empty_seq{typeid} ")" ":" typeid options "."
+    letfun_decl: "letfun" IDENT [ "(" [ typedecl ] ")" ] "=" pterm "."
+    reduc_decl: "reduc" eqlist options "."
+    fun_reduc_decl: "fun" IDENT "(" _maybe_empty_seq{typeid} ")" ":" typeid "reduc" mayfailreduc options "."
+    equation_decl: "equation" eqlist options "."
+    pred_decl: "pred" IDENT [ "(" [ _maybe_empty_seq{typeid} ] ")" ] options "."
+    table_decl: IDENT "(" _maybe_empty_seq{typeid} ")" "."
+    let_decl: "let" IDENT [ "(" [ typedecl ] ")" ] "=" process "."
+
+    BOOL : "true" | "false"
+    NONE: "none"
+    FULL: "full"
+    ALL: "all"
+    FUNC: IDENT
+    ignoretype_options: BOOL | ALL | NONE | "attacker"
+    boolean_settings_names: "privateCommOnPublicTerms"
+                            | "rejectChoiceTrueFalse"
+                            | "rejectNoSimplif"
+                            | "allowDiffPatterns"
+                            | "inductionQueries"
+                            | "inductionLemmas"
+                            | "movenew"
+                            | "movelet"
+                            | "stopTerm"
+                            | "removeEventsForLemma"
+                            | "simpEqAll"
+                            | "eqInNames"
+                            | "preciseLetExpand"
+                            | "expandSimplifyIfCst"
+                            | "featureFuns"
+                            | "featureNames"
+                            | "featurePredicates"
+                            | "featureEvents"
+                            | "featureTables"
+                            | "featureDepth"
+                            | "featureWidth"
+                            | "simplifyDerivation"
+                            | "abbreviateDerivation"
+                            | "explainDerivation"
+                            | "unifyDerivation"
+                            | "reconstructDerivation"
+                            | "displayDerivation"
+                            | "traceBacktracking"
+                            | "interactiveSwapping"
+                            | "color"
+                            | "verboseLemmas"
+                            | "abbreviateClauses"
+                            | "removeUselessClausesBeforeDisplay"
+                            | "verboseEq"
+                            | "verboseDestructors"
+                            | "verboseTerm"
+                            | "verboseStatistics"
+                            | "verboseRules"
+                            | "verboseBase"
+                            | "verboseRedundant"
+                            | "verboseCompleted"
+                            | "verboseGoalReachable"
+
+    _decl_pair{name, value}: "set" name "=" value "."
+
+    set_settings_boolean_decl: _decl_pair{boolean_settings_names, BOOL}
+
+    ignore_types_values: BOOL | "all" | "none" | "attacker"
+    simplify_process_values: BOOL | "interactive"
+    precise_actions_values: BOOL | "trueWithoutArgsInNames"
+    redundant_hyp_elim_values: BOOL | "beginOnly"
+    reconstruct_trace_values: BOOL | "n"
+    attacker_values: "active" | "passive"
+    key_compromise_values: "none" | "approx" | "strict"
+    predicates_implementable: "check" | "nocheck"
+    application_values: "instantiate" | "full" | "none" | "discard"
+    max_values: "none" | "n"
+    sel_fun_values: "TermMaxsize" | "Term"| "NounifsetMaxsize" | "Nounifset"
+    redundancy_elim_values: "best" | "simple" | "no"
+    nounif_ignore_a_few_times_values: "none" | "auto" | "all"
+    nounif_ignore_ntimes_values: "n"
+    trace_display_values: "short" | "long" | "none"
+    verbose_clauses_values: "none" | "explained" | "short"
+    set_settings_decl: set_settings_boolean_decl
+                | _decl_pair{"ignoreTypes", ignore_types_values}
+                | _decl_pair{"simplifyProcess", simplify_process_values}
+                | _decl_pair{"preciseActions", precise_actions_values}
+                | _decl_pair{"redundantHypElim", redundant_hyp_elim_values}
+                | _decl_pair{"reconstructTrace", reconstruct_trace_values}
+                | _decl_pair{"attacker", attacker_values}
+                | _decl_pair{"keyCompromise", key_compromise_values}
+                | _decl_pair{"predicatesImplementable", predicates_implementable}
+                | _decl_pair{"saturationApplication", application_values}
+                | _decl_pair{"verificationApplication", application_values}
+                | _decl_pair{"maxDepth", max_values}
+                | _decl_pair{"maxHyp", max_values}
+                | _decl_pair{"selFun", sel_fun_values}
+                | _decl_pair{"redundancyElim", redundancy_elim_values}
+                | _decl_pair{"nounifIgnoreAFewTimes", nounif_ignore_a_few_times_values}
+                | _decl_pair{"nounifIgnoreNtimes", nounif_ignore_ntimes_values}
+                | _decl_pair{"traceDisplay", trace_display_values}
+                | _decl_pair{"verboseClauses", verbose_clauses_values}
+                | set_strategy
+                | set_symb_order
+
+    _swap_strategy_seq{x}: x ("->" x)*
+    set_strategy: "set" "swapping" "=" _swap_strategy_seq{TAG} "."
+    _symb_ord_seq{x}: x (">" x)*
+    set_symb_order: "set" "symbOrder" "=" _symb_ord_seq{FUNC} "."
+
+    event_decl: IDENT ["(" _maybe_empty_seq{typeid} ")"] "."
+    query_decl: "query" [ typedecl ";"] query options "."
+
+    axiom_decl:       "axiom"       [ typedecl ";"] lemma options "."
+    restriction_decl: "restriction" [ typedecl ";"] lemma options "."
+    lemma_decl:       "lemma"       [ typedecl ";"] lemma options "."
+
+    noninterf_decl: [ typedecl ";"] _maybe_empty_seq{nidecl} "."
+    weaksecret_decl: "weaksecret" IDENT "."
+    not_decl: "not" [ typedecl ";"] gterm "."
+
+    INT: NAT | "-" NAT
+    select_decl:    "select"    [ typedecl ";"] nounifdecl [ "/" INT ] [ "[" _non_empty_seq{nounifoption} "]" ] "."
+    noselect_decl:  "noselect"  [ typedecl ";"] nounifdecl [ "/" INT ] [ "[" _non_empty_seq{nounifoption} "]" ] "."
+    nounif_decl:    "nounif"    [ typedecl ";"] nounifdecl [ "/" INT ] [ "["_non_empty_seq{nounifoption} "]" ] "."
+
+    elimtrue_decl: "elimtrue" [ failtypedecl ";" ] term "."
+    clauses_decl: "clauses" clauses "."
+
+    # TODO: finish defining these (comes from Cryptoverif)
+    #param_decl: "param" _non_empty_seq{IDENT} options "."
+    #proba_decl: "proba" IDENT ["(...)"] options "."
+    #letproba_decl: "letproba" IDENT ["(...)"] "= ..." "."
+    #proof_decl: "proof" "{" proof "}"
+    #def_decl: "def" IDENT "(" _maybe_empty_seq{typeid} ")" "{" decl* "}"
+    #expand_decl: "expand" IDENT "(" _maybe_empty_seq{typeid} ")" "."
+
+    nidecl: IDENT [ "among" "(" _non_empty_seq{term} ")" ]
+    equality: term "=" term
+            | "let" IDENT "=" term "in" equality
+    mayfailequality: IDENT mayfailterm_seq "=" mayfailterm
+    eqlist: [ "forall" typedecl ";" ] equality [ ";" eqlist ]
+    clause: term
+           | term "->" term
+           | term "<->" term
+           | term "<=>" term
+    clauses: [ "forall" failtypedecl ";" ] clause [ ";" clauses ]
+    mayfailreduc: [ "forall" failtypedecl ";" ] mayfailequality [ "otherwise" mayfailreduc ]
+    NAT: DIGIT+
+    phase: "phase" NAT [";" process]
+    TAG: IDENT
+    sync: "sync" NAT ["[" TAG "]"] [";" process]
+    COMMENT:  "(*" /.*/
+    %import common (WORD, DIGIT, NUMBER, WS) // imports from terminal library
+    %ignore WS // Disregard spaces in text
+    %ignore COMMENT
+""",
+    debug=True,
+    # lexer_callbacks={"COMMENT": comments.append},
+)
+
+# TODO Open ProVerif compatibility questions
+# TODO * does it allow leading zeros for NAT?
+# TODO * tag is not defined? is it ident?
+# TODO * are spaces between "event" and ":" allowed?
+# TODO * spaces between "nat" and "("? "choice" and "["?
+
+
+def parsertest(input):
+    parsetree = proverif_grammar.parse(input)
+    # tree.pydot__tree_to_png(parsetree, name + ".png")
+    return parsetree
+
+
+def main():
+    if len(sys.argv) != 2:
+        print(f"Usage: {sys.argv[0]} <filename>")
+        sys.exit(1)
+    filename = sys.argv[1]
+
+    with open(filename, "r") as f:
+        content = f.read()
+        # print(content)
+        parsertest(content)
+
+
+if __name__ == "__main__":
+    main()