From 5097d9fce1dbcb109883a417afcbcd2cc3e8b6bb Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Mon, 14 Apr 2025 18:13:13 +0200
Subject: [PATCH 01/10] Add benchmarking for cryptographic primitives and
 protocol performance

This commit introduces two kinds of benchmarks:

1. Cryptographic Primitives. Measures the performance of all available
   implementations of cryptographic algorithms using traditional
   benchmarking. Uses criterion.
2. Protocol Runs. Measures the time each step in the protocol takes.
   Measured using a tracing-based approach.

The benchmarks are run on CI and an interactive visual overview is
written to the gh-pages branch. If a benchmark takes more than twice as
long as on the reference commit (for PRs: the main branch), the action
fails.
---
 .github/workflows/bench-primitives.yml | 106 +++++++
 .github/workflows/bench-protocol.yml   |  96 +++++++
 Cargo.lock                             |  29 +-
 Cargo.toml                             |   2 +
 ciphers/Cargo.toml                     |  14 +
 ciphers/benches/primitives.rs          | 346 +++++++++++++++++++++++
 ciphers/src/subtle/libcrux/mod.rs      |   6 +-
 ciphers/src/subtle/mod.rs              |   3 +-
 flake.nix                              |  10 +
 pkgs/rosenpass.nix                     |   1 +
 readme.md                              |  42 ++-
 rosenpass/Cargo.toml                   |   7 +
 rosenpass/benches/trace_handshake.rs   | 371 +++++++++++++++++++++++++
 rosenpass/src/protocol/protocol.rs     | 259 +++++++++++------
 util/Cargo.toml                        |   3 +
 util/src/lib.rs                        |   3 +
 util/src/trace_bench.rs                |  19 ++
 17 files changed, 1225 insertions(+), 92 deletions(-)
 create mode 100644 .github/workflows/bench-primitives.yml
 create mode 100644 .github/workflows/bench-protocol.yml
 create mode 100644 ciphers/benches/primitives.rs
 create mode 100644 rosenpass/benches/trace_handshake.rs
 create mode 100644 util/src/trace_bench.rs

diff --git a/.github/workflows/bench-primitives.yml b/.github/workflows/bench-primitives.yml
new file mode 100644
index 0000000..c2a6234
--- /dev/null
+++ b/.github/workflows/bench-primitives.yml
@@ -0,0 +1,106 @@
+name: rosenpass-ciphers - primitives - benchmark
+
+on:
+  pull_request:
+  push:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  prim-benchmark:
+    strategy:
+      fail-fast: true
+      matrix:
+        system: ["x86_64-linux", "i686-linux"]
+
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Install nix
+
+      - name: Install Nix
+        uses: cachix/install-nix-action@v27 # A popular action for installing Nix
+        with:
+          extra_nix_config: |
+            experimental-features = nix-command flakes
+            access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
+
+      # Set up environment
+
+      - name: 🛠️ Config Linux x64
+        run: echo "RUST_TARGET_FLAG=--target=x86_64-unknown-linux-gnu" >> "$GITHUB_ENV"
+        if: ${{ matrix.system == 'x86_64-linux' }}
+
+      - name: 🛠️ Config Linux x86
+        run: echo "RUST_TARGET_FLAG=--target=i686-unknown-linux-gnu" >> "$GITHUB_ENV"
+        if: ${{ matrix.system == 'i686-linux' }}
+
+      - name: 🛠️ Prepare Benchmark Path
+        env:
+          EVENT_NAME:  ${{ github.event_name }}
+          BRANCH_NAME: ${{ github.ref_name   }}
+          PR_NUMBER:   ${{ github.event.pull_request.number }}
+        run: |
+          case "$EVENT_NAME" in
+          "push")
+            echo "BENCH_PATH=branch/$BRANCH_NAME" >> $GITHUB_ENV
+            ;;
+          "pull_request")
+            echo "BENCH_PATH=pull/$PR_NUMBER" >> $GITHUB_ENV
+            ;;
+          *)
+            echo "don't know benchmark path for event of type $EVENT_NAME, aborting"
+            exit 1
+          esac
+
+      # Benchmarks ...
+
+      - name: 🏃🏻‍♀️ Benchmarks (using Nix as shell)
+        working-directory: ciphers
+        run: nix develop ".#devShells.${{ matrix.system }}.benchmark" --command cargo bench -F bench --bench primitives --verbose $RUST_TARGET_FLAG -- --output-format bencher | tee ../bench-primitives.txt
+
+      - name: Extract benchmarks
+        uses: cryspen/benchmark-data-extract-transform@v2
+        with:
+          name: rosenpass-ciphers primitives benchmarks
+          tool: "cargo"
+          os: ${{ matrix.system }}
+          output-file-path: bench-primitives.txt
+          data-out-path: bench-primitives.json
+
+      - name: Upload benchmarks
+        uses: cryspen/benchmark-upload-and-plot-action@v3
+        with:
+          name: Crypto Primitives Benchmarks
+          group-by: "os,primitive,algorithm"
+          schema: "os,primitive,algorithm,implementation,operation,length"
+          input-data-path: bench-primitives.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          # NOTE: pushes to current repository
+          gh-repository: github.com/${{ github.repository }}
+          # use the default (gh-pages) for the demo
+          #gh-pages-branch: benchmarks
+          auto-push: true
+          fail-on-alert: true
+
+  ciphers-primitives-bench-status:
+    if: ${{ always() }}
+    needs: [prim-benchmark]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Successful
+        if: ${{ !(contains(needs.*.result, 'failure')) }}
+        run: exit 0
+      - name: Failing
+        if: ${{ (contains(needs.*.result, 'failure')) }}
+        run: exit 1
diff --git a/.github/workflows/bench-protocol.yml b/.github/workflows/bench-protocol.yml
new file mode 100644
index 0000000..881d12d
--- /dev/null
+++ b/.github/workflows/bench-protocol.yml
@@ -0,0 +1,96 @@
+name: rosenpass - protocol - benchmark
+
+on:
+  pull_request:
+  push:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  proto-benchmark:
+    strategy:
+      fail-fast: true
+      matrix:
+        system: ["x86_64-linux", "i686-linux"]
+
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Install nix
+
+      - name: Install Nix
+        uses: cachix/install-nix-action@v27 # A popular action for installing Nix
+        with:
+          extra_nix_config: |
+            experimental-features = nix-command flakes
+            access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
+
+      # Set up environment
+
+      - name: 🛠️ Config Linux x64
+        run: echo "RUST_TARGET_FLAG=--target=x86_64-unknown-linux-gnu" >> "$GITHUB_ENV"
+        if: ${{ matrix.system == 'x86_64-linux' }}
+
+      - name: 🛠️ Config Linux x86
+        run: echo "RUST_TARGET_FLAG=--target=i686-unknown-linux-gnu" >> "$GITHUB_ENV"
+        if: ${{ matrix.system == 'i686-linux' }}
+
+      - name: 🛠️ Prepare Benchmark Path
+        env:
+          EVENT_NAME:  ${{ github.event_name }}
+          BRANCH_NAME: ${{ github.ref_name   }}
+          PR_NUMBER:   ${{ github.event.pull_request.number }}
+        run: |
+          case "$EVENT_NAME" in
+          "push")
+            echo "BENCH_PATH=branch/$BRANCH_NAME" >> $GITHUB_ENV
+            ;;
+          "pull_request")
+            echo "BENCH_PATH=pull/$PR_NUMBER" >> $GITHUB_ENV
+            ;;
+          *)
+            echo "don't know benchmark path for event of type $EVENT_NAME, aborting"
+            exit 1
+          esac
+
+      # Benchmarks ...
+
+      - name: 🏃🏻‍♀️ Benchmarks
+        run: nix develop ".#devShells.${{ matrix.system }}.benchmark" --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose $RUST_TARGET_FLAG >bench-protocol.json
+
+      - name: Upload benchmarks
+        uses: cryspen/benchmark-upload-and-plot-action@v3
+        with:
+          name: Protocol Benchmarks
+          group-by: "os,arch,protocol version,run time"
+          schema: "os,arch,protocol version,run time,name"
+          input-data-path: bench-protocol.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          # NOTE: pushes to current repository
+          gh-repository: github.com/${{ github.repository }}
+          # use the default (gh-pages) for the demo
+          #gh-pages-branch: benchmarks
+          auto-push: true
+          fail-on-alert: true
+
+  ciphers-protocol-bench-status:
+    if: ${{ always() }}
+    needs: [proto-benchmark]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Successful
+        if: ${{ !(contains(needs.*.result, 'failure')) }}
+        run: exit 0
+      - name: Failing
+        if: ${{ (contains(needs.*.result, 'failure')) }}
+        run: exit 1
diff --git a/Cargo.lock b/Cargo.lock
index 98e590d..b356525 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1269,7 +1269,7 @@ version = "0.0.3-pre"
 source = "git+https://github.com/cryspen/libcrux.git?rev=10ce653e9476#10ce653e94761352b657b6cecdcc0c85675813df"
 dependencies = [
  "libcrux-hacl-rs",
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
 ]
 
 [[package]]
@@ -1279,7 +1279,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78d522fb626847390ea4b776c7eca179ecec363c6c4730b61b0c0feb797b8d92"
 dependencies = [
  "libcrux-hacl-rs",
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
  "libcrux-poly1305",
 ]
 
@@ -1299,7 +1299,7 @@ version = "0.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8bba0885296a72555a5d77056c39cc9b04edd9ab1afa3025ef3dbd96220705c"
 dependencies = [
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
 ]
 
 [[package]]
@@ -1321,6 +1321,15 @@ dependencies = [
  "syn 2.0.98",
 ]
 
+[[package]]
+name = "libcrux-macros"
+version = "0.0.3"
+source = "git+https://github.com/cryspen/libcrux.git?rev=0ab6d2dd9c1f#0ab6d2dd9c1f39c82b1125a566d6befb38feea28"
+dependencies = [
+ "quote",
+ "syn 2.0.98",
+]
+
 [[package]]
 name = "libcrux-ml-kem"
 version = "0.0.2-beta.3"
@@ -1350,7 +1359,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "80143d78ae14ab51ceb2c8a9514fb60af6645d42a9c951bc511792c19c974fca"
 dependencies = [
  "libcrux-hacl-rs",
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
 ]
 
 [[package]]
@@ -1364,6 +1373,14 @@ dependencies = [
  "libcrux-platform",
 ]
 
+[[package]]
+name = "libcrux-test-utils"
+version = "0.0.2"
+source = "git+https://github.com/cryspen/libcrux.git?rev=0ab6d2dd9c1f#0ab6d2dd9c1f39c82b1125a566d6befb38feea28"
+dependencies = [
+ "libcrux-macros 0.0.3",
+]
+
 [[package]]
 name = "libfuzzer-sys"
 version = "0.4.9"
@@ -2024,6 +2041,7 @@ dependencies = [
  "hex",
  "hex-literal",
  "home",
+ "libcrux-test-utils",
  "log",
  "memoffset 0.9.1",
  "mio",
@@ -2070,6 +2088,7 @@ dependencies = [
  "anyhow",
  "blake2",
  "chacha20poly1305",
+ "criterion",
  "libcrux",
  "libcrux-blake2",
  "libcrux-chacha20poly1305",
@@ -2153,6 +2172,8 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "base64ct",
+ "lazy_static",
+ "libcrux-test-utils",
  "mio",
  "rustix",
  "static_assertions",
diff --git a/Cargo.toml b/Cargo.toml
index 8583095..368b891 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -73,12 +73,14 @@ libcrux = { version = "0.0.2-pre.2" }
 libcrux-chacha20poly1305 = { version = "0.0.2-beta.3" }
 libcrux-ml-kem = { version = "0.0.2-beta.3" }
 libcrux-blake2 = { git = "https://github.com/cryspen/libcrux.git", rev = "10ce653e9476" }
+libcrux-test-utils = { git = "https://github.com/cryspen/libcrux.git", rev = "0ab6d2dd9c1f" }
 hex-literal = { version = "0.4.1" }
 hex = { version = "0.4.3" }
 heck = { version = "0.5.0" }
 libc = { version = "0.2" }
 uds = { git = "https://github.com/rosenpass/uds" }
 signal-hook = "0.3.17"
+lazy_static = "1.5"
 
 #Dev dependencies
 serial_test = "3.2.0"
diff --git a/ciphers/Cargo.toml b/ciphers/Cargo.toml
index 2e8e5c7..080deeb 100644
--- a/ciphers/Cargo.toml
+++ b/ciphers/Cargo.toml
@@ -24,6 +24,19 @@ experiment_libcrux_chachapoly_test = [
   "dep:libcrux",
 ]
 experiment_libcrux_kyber = ["dep:libcrux-ml-kem", "dep:rand"]
+bench = [
+    "dep:thiserror",
+    "dep:rand",
+    "dep:libcrux",
+    "dep:libcrux-blake2",
+    "dep:libcrux-ml-kem",
+    "dep:libcrux-chacha20poly1305",
+]
+
+[[bench]]
+name = "primitives"
+harness = false
+required-features = ["bench"]
 
 [dependencies]
 anyhow = { workspace = true }
@@ -50,3 +63,4 @@ libcrux = { workspace = true, optional = true }
 
 [dev-dependencies]
 rand = { workspace = true }
+criterion = { workspace = true }
diff --git a/ciphers/benches/primitives.rs b/ciphers/benches/primitives.rs
new file mode 100644
index 0000000..eb81c66
--- /dev/null
+++ b/ciphers/benches/primitives.rs
@@ -0,0 +1,346 @@
+criterion::criterion_main!(keyed_hash::benches, aead::benches, kem::benches);
+
+fn benchid(base: KvPairs, last: KvPairs) -> String {
+    format!("{base},{last}")
+}
+
+#[derive(Clone, Copy, Debug)]
+struct KvPair<'a>(&'a str, &'a str);
+
+impl std::fmt::Display for KvPair<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{k}={v}", k = self.0, v = self.1)
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+struct KvPairs<'a>(&'a [KvPair<'a>]);
+
+/// Renders the pairs as a comma-separated list of `key=value` items.
+///
+/// An empty slice renders as the empty string.
+impl std::fmt::Display for KvPairs<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // `delim` is empty before the first pair and "," before every later
+        // one, so an empty or single-element slice needs no special case:
+        // the former writes nothing and the latter writes just that pair.
+        let mut delim = "";
+        for pair in self.0 {
+            write!(f, "{delim}{pair}")?;
+            delim = ",";
+        }
+        Ok(())
+    }
+}
+
+mod kem {
+    criterion::criterion_group!(
+        benches,
+        bench_kyber512_libcrux,
+        bench_kyber512_oqs,
+        bench_classicmceliece460896_oqs
+    );
+
+    use criterion::Criterion;
+
+    fn bench_classicmceliece460896_oqs(c: &mut Criterion) {
+        template(
+            c,
+            "classicmceliece460896",
+            "oqs",
+            rosenpass_oqs::ClassicMceliece460896,
+        );
+    }
+
+    fn bench_kyber512_libcrux(c: &mut Criterion) {
+        template(
+            c,
+            "kyber512",
+            "libcrux",
+            rosenpass_ciphers::subtle::libcrux::kyber512::Kyber512,
+        );
+    }
+
+    fn bench_kyber512_oqs(c: &mut Criterion) {
+        template(c, "kyber512", "oqs", rosenpass_oqs::Kyber512);
+    }
+
+    use rosenpass_cipher_traits::primitives::Kem;
+
+    fn template<
+        const SK_LEN: usize,
+        const PK_LEN: usize,
+        const CT_LEN: usize,
+        const SHK_LEN: usize,
+        T: Kem<SK_LEN, PK_LEN, CT_LEN, SHK_LEN>,
+    >(
+        c: &mut Criterion,
+        alg_name: &str,
+        impl_name: &str,
+        scheme: T,
+    ) {
+        use super::{benchid, KvPair, KvPairs};
+
+        let base = [
+            KvPair("primitive", "kem"),
+            KvPair("algorithm", alg_name),
+            KvPair("implementation", impl_name),
+            KvPair("length", "-1"),
+        ];
+
+        let kem_benchid = |op| benchid(KvPairs(&base), KvPairs(&[KvPair("operation", op)]));
+
+        c.bench_function(&kem_benchid("keygen"), |bench| {
+            let mut sk = [0; SK_LEN];
+            let mut pk = [0; PK_LEN];
+
+            bench.iter(|| {
+                scheme.keygen(&mut sk, &mut pk).unwrap();
+            });
+        });
+
+        c.bench_function(&kem_benchid("encaps"), |bench| {
+            let mut sk = [0; SK_LEN];
+            let mut pk = [0; PK_LEN];
+            let mut ct = [0; CT_LEN];
+            let mut shk = [0; SHK_LEN];
+
+            scheme.keygen(&mut sk, &mut pk).unwrap();
+
+            bench.iter(|| {
+                scheme.encaps(&mut shk, &mut ct, &pk).unwrap();
+            });
+        });
+
+        c.bench_function(&kem_benchid("decaps"), |bench| {
+            let mut sk = [0; SK_LEN];
+            let mut pk = [0; PK_LEN];
+            let mut ct = [0; CT_LEN];
+            let mut shk = [0; SHK_LEN];
+            let mut shk2 = [0; SHK_LEN];
+
+            scheme.keygen(&mut sk, &mut pk).unwrap();
+            scheme.encaps(&mut shk, &mut ct, &pk).unwrap();
+
+            bench.iter(|| {
+                scheme.decaps(&mut shk2, &sk, &ct).unwrap();
+            });
+        });
+    }
+}
+mod aead {
+    criterion::criterion_group!(
+        benches,
+        bench_chachapoly_libcrux,
+        bench_chachapoly_rustcrypto,
+        bench_xchachapoly_rustcrypto,
+    );
+
+    use criterion::Criterion;
+
+    const KEY_LEN: usize = rosenpass_ciphers::Aead::KEY_LEN;
+    const TAG_LEN: usize = rosenpass_ciphers::Aead::TAG_LEN;
+
+    fn bench_xchachapoly_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "xchacha20poly1305",
+            "rustcrypto",
+            rosenpass_ciphers::subtle::rust_crypto::xchacha20poly1305_ietf::XChaCha20Poly1305,
+        );
+    }
+
+    fn bench_chachapoly_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "chacha20poly1305",
+            "rustcrypto",
+            rosenpass_ciphers::subtle::rust_crypto::chacha20poly1305_ietf::ChaCha20Poly1305,
+        );
+    }
+
+    fn bench_chachapoly_libcrux(c: &mut Criterion) {
+        template(
+            c,
+            "chacha20poly1305",
+            "libcrux",
+            rosenpass_ciphers::subtle::libcrux::chacha20poly1305_ietf::ChaCha20Poly1305,
+        );
+    }
+
+    use rosenpass_cipher_traits::primitives::Aead;
+
+    fn template<const NONCE_LEN: usize, T: Aead<KEY_LEN, NONCE_LEN, TAG_LEN>>(
+        c: &mut Criterion,
+        alg_name: &str,
+        impl_name: &str,
+        scheme: T,
+    ) {
+        use crate::{benchid, KvPair, KvPairs};
+
+        let base = [
+            KvPair("primitive", "aead"),
+            KvPair("algorithm", alg_name),
+            KvPair("implementation", impl_name),
+        ];
+        let aead_benchid = |op, len| {
+            benchid(
+                KvPairs(&base),
+                KvPairs(&[KvPair("operation", op), KvPair("length", len)]),
+            )
+        };
+
+        let key = [12; KEY_LEN];
+        let nonce = [23; NONCE_LEN];
+        let ad = [];
+
+        c.bench_function(&aead_benchid("encrypt", "32byte"), |bench| {
+            const DATA_LEN: usize = 32;
+
+            let ptxt = [34u8; DATA_LEN];
+            let mut ctxt = [0; DATA_LEN + TAG_LEN];
+
+            bench.iter(|| {
+                scheme.encrypt(&mut ctxt, &key, &nonce, &ad, &ptxt).unwrap();
+            });
+        });
+
+        c.bench_function(&aead_benchid("decrypt", "32byte"), |bench| {
+            const DATA_LEN: usize = 32;
+
+            let ptxt = [34u8; DATA_LEN];
+            let mut ctxt = [0; DATA_LEN + TAG_LEN];
+            let mut ptxt_out = [0u8; DATA_LEN];
+
+            scheme.encrypt(&mut ctxt, &key, &nonce, &ad, &ptxt).unwrap();
+
+            bench.iter(|| {
+                scheme
+                    .decrypt(&mut ptxt_out, &key, &nonce, &ad, &mut ctxt)
+                    .unwrap()
+            })
+        });
+
+        c.bench_function(&aead_benchid("encrypt", "1024byte"), |bench| {
+            const DATA_LEN: usize = 1024;
+
+            let ptxt = [34u8; DATA_LEN];
+            let mut ctxt = [0; DATA_LEN + TAG_LEN];
+
+            bench.iter(|| {
+                scheme.encrypt(&mut ctxt, &key, &nonce, &ad, &ptxt).unwrap();
+            });
+        });
+        c.bench_function(&aead_benchid("decrypt", "1024byte"), |bench| {
+            const DATA_LEN: usize = 1024;
+
+            let ptxt = [34u8; DATA_LEN];
+            let mut ctxt = [0; DATA_LEN + TAG_LEN];
+            let mut ptxt_out = [0u8; DATA_LEN];
+
+            scheme.encrypt(&mut ctxt, &key, &nonce, &ad, &ptxt).unwrap();
+
+            bench.iter(|| {
+                scheme
+                    .decrypt(&mut ptxt_out, &key, &nonce, &ad, &mut ctxt)
+                    .unwrap()
+            })
+        });
+    }
+}
+
+mod keyed_hash {
+    criterion::criterion_group!(
+        benches,
+        bench_blake2b_rustcrypto,
+        bench_blake2b_libcrux,
+        bench_shake256_rustcrypto,
+    );
+
+    const KEY_LEN: usize = 32;
+    const HASH_LEN: usize = 32;
+
+    use criterion::Criterion;
+
+    fn bench_shake256_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "shake256",
+            "rustcrypto",
+            &rosenpass_ciphers::subtle::rust_crypto::keyed_shake256::SHAKE256Core,
+        );
+    }
+
+    fn bench_blake2b_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "blake2b",
+            "rustcrypto",
+            &rosenpass_ciphers::subtle::rust_crypto::blake2b::Blake2b,
+        );
+    }
+
+    fn bench_blake2b_libcrux(c: &mut Criterion) {
+        template(
+            c,
+            "blake2b",
+            "libcrux",
+            &rosenpass_ciphers::subtle::libcrux::blake2b::Blake2b,
+        );
+    }
+
+    use rosenpass_cipher_traits::primitives::KeyedHash;
+
+    fn template<H: KeyedHash<KEY_LEN, HASH_LEN>>(
+        c: &mut Criterion,
+        alg_name: &str,
+        impl_name: &str,
+        _: &H,
+    ) where
+        H::Error: std::fmt::Debug,
+    {
+        use crate::{benchid, KvPair, KvPairs};
+
+        let key = [12u8; KEY_LEN];
+        let mut out = [0u8; HASH_LEN];
+
+        let base = [
+            KvPair("primitive", "keyedhash"),
+            KvPair("algorithm", alg_name),
+            KvPair("implementation", impl_name),
+            KvPair("operation", "hash"),
+        ];
+        let keyedhash_benchid = |len| benchid(KvPairs(&base), KvPairs(&[KvPair("length", len)]));
+
+        c.bench_function(&keyedhash_benchid("32byte"), |bench| {
+            let bytes = [34u8; 32];
+
+            bench.iter(|| {
+                H::keyed_hash(&key, &bytes, &mut out).unwrap();
+            })
+        })
+        .bench_function(&keyedhash_benchid("64byte"), |bench| {
+            let bytes = [34u8; 64];
+
+            bench.iter(|| {
+                H::keyed_hash(&key, &bytes, &mut out).unwrap();
+            })
+        })
+        .bench_function(&keyedhash_benchid("128byte"), |bench| {
+            let bytes = [34u8; 128];
+
+            bench.iter(|| {
+                H::keyed_hash(&key, &bytes, &mut out).unwrap();
+            })
+        })
+        .bench_function(&keyedhash_benchid("1024byte"), |bench| {
+            let bytes = [34u8; 1024];
+
+            bench.iter(|| {
+                H::keyed_hash(&key, &bytes, &mut out).unwrap();
+            })
+        });
+    }
+}
+
+mod templates {}
diff --git a/ciphers/src/subtle/libcrux/mod.rs b/ciphers/src/subtle/libcrux/mod.rs
index 432bd5f..210bc61 100644
--- a/ciphers/src/subtle/libcrux/mod.rs
+++ b/ciphers/src/subtle/libcrux/mod.rs
@@ -4,11 +4,11 @@
 //!
 //! [Github](https://github.com/cryspen/libcrux)
 
-#[cfg(feature = "experiment_libcrux_blake2")]
+#[cfg(any(feature = "experiment_libcrux_blake2", feature = "bench"))]
 pub mod blake2b;
 
-#[cfg(feature = "experiment_libcrux_chachapoly")]
+#[cfg(any(feature = "experiment_libcrux_chachapoly", feature = "bench"))]
 pub mod chacha20poly1305_ietf;
 
-#[cfg(feature = "experiment_libcrux_kyber")]
+#[cfg(any(feature = "experiment_libcrux_kyber", feature = "bench"))]
 pub mod kyber512;
diff --git a/ciphers/src/subtle/mod.rs b/ciphers/src/subtle/mod.rs
index b3c3aa8..6221324 100644
--- a/ciphers/src/subtle/mod.rs
+++ b/ciphers/src/subtle/mod.rs
@@ -11,6 +11,7 @@ pub mod rust_crypto;
 #[cfg(any(
     feature = "experiment_libcrux_blake2",
     feature = "experiment_libcrux_chachapoly",
-    feature = "experiment_libcrux_kyber"
+    feature = "experiment_libcrux_kyber",
+    feature = "bench"
 ))]
 pub mod libcrux;
diff --git a/flake.nix b/flake.nix
index 68ae590..4b3413f 100644
--- a/flake.nix
+++ b/flake.nix
@@ -89,6 +89,7 @@
         [
           "x86_64-linux"
           "aarch64-linux"
+          "i686-linux"
         ]
         (
           system:
@@ -172,6 +173,15 @@
                 inherit (pkgs.cargo-llvm-cov) LLVM_COV LLVM_PROFDATA;
               };
             };
+            devShells.benchmark = pkgs.mkShell {
+              inputsFrom = [ pkgs.rosenpass ];
+              nativeBuildInputs = let
+                      rustToolchain = (inputs.fenix.packages.${system}.toolchainOf {
+                          channel = "1.77.0";
+                          sha256 = "sha256-+syqAd2kX8KVa8/U2gz3blIQTTsYYt3U63xBWaGOSc8=";
+                      });
+                  in [ rustToolchain.toolchain ];
+            };
 
             checks =
               {
diff --git a/pkgs/rosenpass.nix b/pkgs/rosenpass.nix
index 30f560f..c25809f 100644
--- a/pkgs/rosenpass.nix
+++ b/pkgs/rosenpass.nix
@@ -79,6 +79,7 @@ rustPlatform.buildRustPackage {
       "memsec-0.6.3" = "sha256-4ri+IEqLd77cLcul3lZrmpDKj4cwuYJ8oPRAiQNGeLw=";
       "uds-0.4.2" = "sha256-qlxr/iJt2AV4WryePIvqm/8/MK/iqtzegztNliR93W8=";
       "libcrux-blake2-0.0.3-pre" = "sha256-0CLjuzwJqGooiODOHf5D8Hc8ClcG/XcGvVGyOVnLmJY=";
+      "libcrux-macros-0.0.3" = "sha256-Tb5uRirwhRhoFEK8uu1LvXl89h++40pxzZ+7kXe8RAI=";
     };
   };
 
diff --git a/readme.md b/readme.md
index aa4c881..c4ceb1a 100644
--- a/readme.md
+++ b/readme.md
@@ -1,3 +1,37 @@
+# Changes on This Branch
+
+This branch adds facilities for benchmarking both the Rosenpass protocol
+code and the implementations of the primitives behind it. The primitives
+are benchmarked using criterion. For the protocol code, we use a custom
+library for instrumenting the code such that events are written to a
+trace, which is then inspected after a run.
+
+## Protocol Benchmark
+
+The trace that is being written to lives in a new module
+`trace_bench` in the util crate. A basic benchmark that
+performs some minor statistical analysis of the trace can be run using
+
+```
+cargo bench -p rosenpass --bench trace_handshake -F trace_bench
+```
+
+## Primitive Benchmark
+
+Benchmarks for the functions of the traits `Kem`, `Aead` and `KeyedHash`
+have been added and are run for all implementations in the `primitives`
+benchmark of `rosenpass-ciphers`. Run the benchmarks using
+
+```
+cargo bench -p rosenpass-ciphers --bench primitives -F bench
+```
+
+Note that the `bench` feature includes the libcrux-backed trait
+implementations in the module tree, but does not select them as the
+defaults.
+
+---
+
 # Rosenpass README
 
 ![Nix](https://github.com/rosenpass/rosenpass/actions/workflows/nix.yaml/badge.svg)
@@ -14,7 +48,7 @@ This repository contains
 
 ## Getting started
 
-First, [install rosenpass](#Getting-Rosenpass). Then, check out the help functions of `rp` & `rosenpass`:
+First, [install rosenpass](#getting-rosenpass). Then, check out the help functions of `rp` & `rosenpass`:
 
 ```sh
 rp help
@@ -64,11 +98,7 @@ The analysis is implemented according to modern software engineering principles:
 The code uses a variety of optimizations to speed up analysis such as using secret functions to model trusted/malicious setup. We split the model into two separate entry points which can be analyzed in parallel. Each is much faster than both models combined.
 A wrapper script provides instant feedback about which queries execute as expected in color: A red cross if a query fails and a green check if it succeeds.
 
-[^liboqs]: https://openquantumsafe.org/liboqs/
-[^wg]: https://www.wireguard.com/
-[^pqwg]: https://eprint.iacr.org/2020/379
-[^pqwg-statedis]: Unless supplied with a pre-shared-key, but this defeats the purpose of a key exchange protocol
-[^wg-statedis]: https://lists.zx2c4.com/pipermail/wireguard/2021-August/006916.htmlA
+[^liboqs]: <https://openquantumsafe.org/liboqs/>
 
 # Getting Rosenpass
 
diff --git a/rosenpass/Cargo.toml b/rosenpass/Cargo.toml
index 2975a1f..c23d1f3 100644
--- a/rosenpass/Cargo.toml
+++ b/rosenpass/Cargo.toml
@@ -35,6 +35,11 @@ required-features = [
   "internal_bin_gen_ipc_msg_types",
 ]
 
+[[bench]]
+name = "trace_handshake"
+harness = false
+required-features = ["trace_bench"]
+
 [[bench]]
 name = "handshake"
 harness = false
@@ -72,6 +77,7 @@ command-fds = { workspace = true, optional = true }
 rustix = { workspace = true, optional = true }
 uds = { workspace = true, optional = true, features = ["mio_1xx"] }
 signal-hook = { workspace = true, optional = true }
+libcrux-test-utils = { workspace = true, optional = true }
 
 [build-dependencies]
 anyhow = { workspace = true }
@@ -106,6 +112,7 @@ experiment_api = [
 internal_signal_handling_for_coverage_reports = ["signal-hook"]
 internal_testing = []
 internal_bin_gen_ipc_msg_types = ["hex", "heck"]
+trace_bench = ["rosenpass-util/trace_bench", "dep:libcrux-test-utils"]
 
 [lints.rust]
 unexpected_cfgs = { level = "allow", check-cfg = ['cfg(coverage)'] }
diff --git a/rosenpass/benches/trace_handshake.rs b/rosenpass/benches/trace_handshake.rs
new file mode 100644
index 0000000..9adaafe
--- /dev/null
+++ b/rosenpass/benches/trace_handshake.rs
@@ -0,0 +1,371 @@
+// Standard library imports
+use std::{
+    collections::HashMap,
+    hint::black_box,
+    io::{self, Write},
+    ops::DerefMut,
+    time::{Duration, Instant},
+};
+
+// External crate imports
+use anyhow::Result;
+use libcrux_test_utils::tracing::{EventType, Trace as _};
+use rosenpass::protocol::{
+    CryptoServer, HandleMsgResult, MsgBuf, PeerPtr, ProtocolVersion, SPk, SSk, SymKey,
+};
+use rosenpass_cipher_traits::primitives::Kem;
+use rosenpass_ciphers::StaticKem;
+use rosenpass_secret_memory::secret_policy_try_use_memfd_secrets;
+use rosenpass_util::trace_bench::{RpEventType, TRACE};
+
+const ITERATIONS: usize = 100;
+
+fn handle(
+    tx: &mut CryptoServer,
+    msgb: &mut MsgBuf,
+    msgl: usize,
+    rx: &mut CryptoServer,
+    resb: &mut MsgBuf,
+) -> Result<(Option<SymKey>, Option<SymKey>)> {
+    let HandleMsgResult {
+        exchanged_with: xch,
+        resp,
+    } = rx.handle_msg(&msgb[..msgl], &mut **resb)?;
+
+    assert!(matches!(xch, None | Some(PeerPtr(0))));
+
+    let xch = xch.map(|p| rx.osk(p).unwrap());
+
+    let (rxk, txk) = resp
+        .map(|resl| handle(rx, resb, resl, tx, msgb))
+        .transpose()?
+        .unwrap_or((None, None));
+
+    assert!(rxk.is_none() || xch.is_none());
+
+    Ok((txk, rxk.or(xch)))
+}
+
+fn hs(ini: &mut CryptoServer, res: &mut CryptoServer) -> Result<()> {
+    let (mut inib, mut resb) = (MsgBuf::zero(), MsgBuf::zero());
+    let sz = ini.initiate_handshake(PeerPtr(0), &mut *inib)?;
+    let (kini, kres) = handle(ini, &mut inib, sz, res, &mut resb)?;
+    assert!(kini.unwrap().secret() == kres.unwrap().secret());
+    Ok(())
+}
+
+fn keygen() -> Result<(SSk, SPk)> {
+    let (mut sk, mut pk) = (SSk::zero(), SPk::zero());
+    StaticKem.keygen(sk.secret_mut(), pk.deref_mut())?;
+    Ok((sk, pk))
+}
+
+fn make_server_pair(protocol_version: ProtocolVersion) -> Result<(CryptoServer, CryptoServer)> {
+    let psk = SymKey::random();
+    let ((ska, pka), (skb, pkb)) = (keygen()?, keygen()?);
+    let (mut a, mut b) = (
+        CryptoServer::new(ska, pka.clone()),
+        CryptoServer::new(skb, pkb.clone()),
+    );
+    a.add_peer(Some(psk.clone()), pkb, protocol_version.clone())?;
+    b.add_peer(Some(psk), pka, protocol_version)?;
+    Ok((a, b))
+}
+
+fn main() {
+    // Attempt to use memfd_secrets for storing sensitive key material
+    secret_policy_try_use_memfd_secrets();
+
+    // Run protocol for V02
+    let (mut a_v02, mut b_v02) = make_server_pair(ProtocolVersion::V02).unwrap();
+    for _ in 0..ITERATIONS {
+        hs(black_box(&mut a_v02), black_box(&mut b_v02)).unwrap();
+    }
+
+    // Emit a marker event to separate V02 and V03 trace sections
+    TRACE.emit_on_the_fly("start-hs-v03");
+
+    // Run protocol for V03
+    let (mut a_v03, mut b_v03) = make_server_pair(ProtocolVersion::V03).unwrap();
+    for _ in 0..ITERATIONS {
+        hs(black_box(&mut a_v03), black_box(&mut b_v03)).unwrap();
+    }
+
+    // Collect the trace events generated during the handshakes
+    let trace: Vec<_> = TRACE.clone().report();
+
+    // Split the trace into V02 and V03 sections based on the marker
+    let (trace_v02, trace_v03) = {
+        let cutoff = trace
+            .iter()
+            .position(|entry| entry.label == "start-hs-v03")
+            .unwrap();
+        // Exclude the marker itself from the V03 trace
+        let (v02, v03_with_marker) = trace.split_at(cutoff);
+        (v02, &v03_with_marker[1..])
+    };
+
+    // Perform statistical analysis on both trace sections and write results as JSON
+    write_json_arrays(
+        &mut std::io::stdout(), // Write to standard output
+        vec![
+            ("V02", statistical_analysis(trace_v02.to_vec())),
+            ("V03", statistical_analysis(trace_v03.to_vec())),
+        ],
+    )
+    .expect("error writing json data");
+}
+
+/// Bins the trace events by label and computes aggregate statistics (mean, std dev) for
+/// every label that has at least one completed span; labels without durations are dropped.
+fn statistical_analysis(trace: Vec<RpEventType>) -> Vec<(&'static str, AggregateStat<Duration>)> {
+    let per_label = bin_events(trace).into_iter().filter_map(|(label, events)| {
+        let durations = extract_span_durations(label, &events);
+        let has_samples = !durations.is_empty();
+        has_samples.then(|| (label, AggregateStat::analyze_durations(&durations)))
+    });
+    per_label.collect()
+}
+
+/// Takes an iterator of ("protocol_version", iterator_of_stats) pairs and writes them
+/// as a single flat JSON array to the provided writer.
+///
+/// # Arguments
+/// * `w` - The writer to output JSON to (e.g., stdout, file).
+/// * `item_groups` - An iterator producing tuples of (`&'static str`, `II`), where
+///   `II` is itself an iterator producing (`&'static str`, `AggregateStat<Duration>`).
+///   Represents the protocol_version name and the statistics items within that protocol_version.
+///
+/// # Type Parameters
+/// * `W` - A type that implements `std::io::Write`.
+/// * `II` - An iterator type yielding (`&'static str`, `AggregateStat<Duration>`).
+fn write_json_arrays<W: Write, II: IntoIterator<Item = (&'static str, AggregateStat<Duration>)>>(
+    w: &mut W,
+    item_groups: impl IntoIterator<Item = (&'static str, II)>,
+) -> io::Result<()> {
+    // Flatten the groups into a single iterator of (protocol_version, label, stats)
+    let iter = item_groups.into_iter().flat_map(|(version, items)| {
+        items
+            .into_iter()
+            .map(move |(label, agg_stat)| (version, label, agg_stat))
+    });
+    let mut delim = ""; // Start with no delimiter
+
+    // Start the JSON array
+    write!(w, "[")?;
+
+    // Write the flattened statistics as JSON objects, separated by commas.
+    for (version, label, agg_stat) in iter {
+        write!(w, "{delim}")?; // Write delimiter (empty for first item, "," for subsequent)
+        agg_stat.write_json_ns(label, version, w)?; // Write the JSON object for the stat entry
+        delim = ","; // Set delimiter for the next iteration
+    }
+
+    // End the JSON array
+    write!(w, "]")
+}
+
+/// Used to group benchmark results in visualizations
+enum RunTimeGroup {
+    /// For particularly long operations.
+    Long,
+    /// Operations of moderate duration.
+    Medium,
+    /// Operations expected to complete under a millisecond.
+    BelowMillisec,
+    /// Very fast operations, likely under a microsecond.
+    BelowMicrosec,
+}
+
+impl std::fmt::Display for RunTimeGroup {
+    /// Renders the group as the short identifier placed in the "run time" JSON field.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // The names are fixed literals, so they are written out without further formatting.
+        f.write_str(match self {
+            Self::Long => "long",
+            Self::Medium => "medium",
+            Self::BelowMillisec => "below_ms",
+            Self::BelowMicrosec => "below_us",
+        })
+    }
+}
+
+/// Maps a span label recorded in the trace to the [`RunTimeGroup`] under which its
+/// statistics are reported in the JSON output.
+fn run_time_group(label: &str) -> RunTimeGroup {
+    match label {
+        // Explicitly categorized labels based on expected performance characteristics
+        "handle_init_hello" | "handle_resp_hello" | "RHI5" | "IHR5" => RunTimeGroup::Long,
+        "RHR1" | "IHI2" | "ICR6" => RunTimeGroup::BelowMicrosec,
+        "RHI6" | "ICI7" | "ICR7" | "RHR3" | "ICR3" | "IHR8" | "ICI4" | "RHI3" | "RHI4" | "RHR4"
+        | "RHR7" | "ICI3" | "IHI3" | "IHI8" | "ICR2" | "ICR4" | "IHR4" | "IHR6" | "IHI4"
+        | "RHI7" => RunTimeGroup::BelowMillisec,
+        // Every label not listed above falls into the default group
+        _ => RunTimeGroup::Medium,
+    }
+}
+
+/// Used temporarily within `extract_span_durations` to track open spans
+/// and calculated durations.
+#[derive(Debug, Clone)]
+enum StatEntry {
+    /// Represents an unmatched SpanOpen event with its timestamp.
+    Start(Instant),
+    /// Represents a completed span with its calculated duration.
+    Duration(Duration),
+}
+
+/// Buckets a flat list of trace events by their label.
+///
+/// Returns a map from each label to the events carrying it, in their original order.
+fn bin_events(events: Vec<RpEventType>) -> HashMap<&'static str, Vec<RpEventType>> {
+    events
+        .into_iter()
+        .fold(HashMap::<_, Vec<_>>::new(), |mut bins, event| {
+            // Look up (or create) the bucket for this label, then move the event into it.
+            bins.entry(event.label).or_default().push(event);
+            bins
+        })
+}
+
+/// Turns the events recorded for one label into the durations of its completed spans.
+/// Each `SpanClose` is paired with the most recent still-unmatched `SpanOpen`, which is
+/// exact for sequential and properly nested spans; unmatched events are reported on stderr.
+fn extract_span_durations(label: &str, events: &[RpEventType]) -> Vec<Duration> {
+    let mut processing_list: Vec<StatEntry> = vec![]; // List to track open spans and final durations
+
+    for entry in events {
+        match &entry.ty {
+            EventType::SpanOpen => {
+                // Record the start time of a new span
+                processing_list.push(StatEntry::Start(entry.at));
+            }
+            EventType::SpanClose => {
+                // Find the most recent unmatched 'Start' entry
+                let start_index = processing_list
+                    .iter()
+                    .rposition(|span| matches!(span, StatEntry::Start(_))); // Find last Start
+
+                match start_index {
+                    Some(index) => {
+                        // Retrieve the start time
+                        let start_time = match processing_list[index] {
+                            StatEntry::Start(t) => t,
+                            _ => unreachable!(), // Should always be Start based on rposition logic
+                        };
+                        // Calculate duration and replace the 'Start' entry with 'Duration'
+                        processing_list[index] = StatEntry::Duration(entry.at - start_time);
+                    }
+                    None => {
+                        // This should not happen with well-formed traces
+                        eprintln!(
+                            "Warning: Found SpanClose without a matching SpanOpen for label '{}': {:?}",
+                            label, entry
+                        );
+                    }
+                }
+            }
+            EventType::OnTheFly => {
+                // Ignore OnTheFly events for duration calculation
+            }
+        }
+    }
+
+    // Collect all calculated durations, reporting any unmatched starts
+    processing_list
+        .into_iter()
+        .filter_map(|span| match span {
+            StatEntry::Start(at) => {
+                // Warn if a span was opened but never closed
+                eprintln!(
+                    "Warning: Unmatched SpanOpen at {:?} for label '{}'",
+                    at, label
+                );
+                None // Discard unmatched starts
+            }
+            StatEntry::Duration(dur) => Some(dur), // Keep calculated durations
+        })
+        .collect()
+}
+
+/// Stores the mean, standard deviation, relative standard deviation (sd/mean),
+/// and the number of samples used for calculation.
+#[derive(Debug)]
+struct AggregateStat<T> {
+    /// Average duration.
+    mean_duration: T,
+    /// Standard deviation of durations.
+    sd_duration: T,
+    /// Standard deviation as a percentage of the mean.
+    sd_by_mean: String,
+    /// Number of duration measurements.
+    sample_size: usize,
+}
+
+impl AggregateStat<Duration> {
+    /// Computes the mean, population standard deviation and relative deviation of `durations`.
+    /// Panics if `durations` is empty.
+    fn analyze_durations(durations: &[Duration]) -> Self {
+        let sample_size = durations.len();
+        assert!(sample_size > 0, "Cannot analyze empty duration slice");
+
+        // Mean duration; the `u32` cast would only truncate beyond `u32::MAX` samples
+        let sum: Duration = durations.iter().sum();
+        let mean = sum / (sample_size as u32);
+
+        // Exact mean in nanoseconds. It must not be padded: the deviations below are taken
+        // against it, and a zero mean is already covered by `checked_div` further down.
+        let mean_ns = mean.as_nanos();
+
+        // Population variance: (sum of squared differences from the mean) / N
+        let variance = durations
+            .iter()
+            .map(Duration::as_nanos)
+            .map(|d_ns| d_ns.abs_diff(mean_ns).pow(2)) // (duration_ns - mean_ns)^2
+            .sum::<u128>() // Sum of squares
+            / (sample_size as u128); // Divide by sample size
+
+        // Standard deviation (sqrt of variance), converted back to a Duration
+        let sd_ns = (variance as f64).sqrt() as u128;
+        let sd = Duration::from_nanos(sd_ns as u64);
+
+        // Relative standard deviation (sd / mean) in hundredths of a percent; 0 for a zero mean
+        let sd_rel_bp = (10000 * sd_ns).checked_div(mean_ns).unwrap_or(0);
+        let sd_rel_formatted = format!("{}.{:02}%", sd_rel_bp / 100, sd_rel_bp % 100);
+
+        AggregateStat {
+            mean_duration: mean,
+            sd_duration: sd,
+            sd_by_mean: sd_rel_formatted,
+            sample_size,
+        }
+    }
+
+    /// Writes the statistics as a JSON object to the provided writer.
+    /// Includes metadata like label, protocol_version, OS, architecture, and run time group.
+    ///
+    /// # Arguments
+    /// * `label` - The specific benchmark/span label.
+    /// * `protocol_version` - Version of the protocol that is benchmarked.
+    /// * `w` - The output writer (must implement `std::io::Write`).
+    fn write_json_ns(
+        &self,
+        label: &str,
+        protocol_version: &str,
+        w: &mut impl io::Write,
+    ) -> io::Result<()> {
+        // Format the JSON string using measured values and environment constants
+        writeln!(
+            w,
+            r#"{{"name":"{name}", "unit":"ns/iter", "value":"{value}", "range":"± {range}", "protocol version":"{protocol_version}", "sample size":"{sample_size}", "os":"{os}", "arch":"{arch}", "run time":"{run_time}"}}"#,
+            name = label,                          // Benchmark name
+            value = self.mean_duration.as_nanos(), // Mean duration in nanoseconds
+            range = self.sd_duration.as_nanos(),   // Standard deviation in nanoseconds
+            sample_size = self.sample_size,        // Number of samples
+            os = std::env::consts::OS,             // Operating system
+            arch = std::env::consts::ARCH,         // CPU architecture
+            run_time = run_time_group(label),      // Run time group category (long, medium, etc.)
+            protocol_version = protocol_version // Protocol version the measurement belongs to
+        )
+    }
+}
diff --git a/rosenpass/src/protocol/protocol.rs b/rosenpass/src/protocol/protocol.rs
index c23a9c8..5a5e7c5 100644
--- a/rosenpass/src/protocol/protocol.rs
+++ b/rosenpass/src/protocol/protocol.rs
@@ -16,7 +16,6 @@ use std::{
 };
 
 use anyhow::{bail, ensure, Context, Result};
-use rand::Fill as Randomize;
 
 use crate::{hash_domains, msgs::*, RosenpassError};
 use memoffset::span_of;
@@ -3547,9 +3546,27 @@ impl CryptoServer {
     }
 }
 
+/// Marks a section of the protocol using the same identifiers as are used in the whitepaper.
+/// When building with the trace benchmarking feature enabled, this also emits span events into the
+/// trace, which allows reconstructing the run times of the individual sections for performance
+/// measurement.
+macro_rules! protocol_section {
+    ($label:expr, $body:tt) => {{
+        #[cfg(feature = "trace_bench")]
+        let _span_raii_handle = rosenpass_util::trace_bench::TRACE.emit_span($label);
+
+        #[allow(unused_braces)]
+        $body
+    }};
+}
+
 impl CryptoServer {
     /// Core cryptographic protocol implementation: Kicks of the handshake
     /// on the initiator side, producing the InitHello message.
+    #[cfg_attr(
+        feature = "trace_bench",
+        rosenpass_util::trace_bench::trace_span("handle_initiation", rosenpass_util::trace_bench::TRACE)
+    )]
     pub fn handle_initiation(&mut self, peer: PeerPtr, ih: &mut InitHello) -> Result<PeerPtr> {
         let mut hs = InitiatorHandshake::zero_with_timestamp(
             self,
@@ -3557,37 +3574,53 @@ impl CryptoServer {
         );
 
         // IHI1
-        hs.core.init(peer.get(self).spkt.deref())?;
+        protocol_section!("IHI1", {
+            hs.core.init(peer.get(self).spkt.deref())?;
+        });
 
         // IHI2
-        hs.core.sidi.randomize();
-        ih.sidi.copy_from_slice(&hs.core.sidi.value);
+        protocol_section!("IHI2", {
+            hs.core.sidi.randomize();
+            ih.sidi.copy_from_slice(&hs.core.sidi.value);
+        });
 
         // IHI3
-        EphemeralKem.keygen(hs.eski.secret_mut(), &mut *hs.epki)?;
-        ih.epki.copy_from_slice(&hs.epki.value);
+        protocol_section!("IHI3", {
+            EphemeralKem.keygen(hs.eski.secret_mut(), &mut *hs.epki)?;
+            ih.epki.copy_from_slice(&hs.epki.value);
+        });
 
         // IHI4
-        hs.core.mix(ih.sidi.as_slice())?.mix(ih.epki.as_slice())?;
+        protocol_section!("IHI4", {
+            hs.core.mix(ih.sidi.as_slice())?.mix(ih.epki.as_slice())?;
+        });
 
         // IHI5
-        hs.core
-            .encaps_and_mix(&StaticKem, &mut ih.sctr, peer.get(self).spkt.deref())?;
+        protocol_section!("IHI5", {
+            hs.core
+                .encaps_and_mix(&StaticKem, &mut ih.sctr, peer.get(self).spkt.deref())?;
+        });
 
         // IHI6
-        hs.core.encrypt_and_mix(
-            ih.pidic.as_mut_slice(),
-            self.pidm(peer.get(self).protocol_version.keyed_hash())?
-                .as_ref(),
-        )?;
+        protocol_section!("IHI6", {
+            hs.core.encrypt_and_mix(
+                ih.pidic.as_mut_slice(),
+                self.pidm(peer.get(self).protocol_version.keyed_hash())?
+                    .as_ref(),
+            )?;
+        });
 
         // IHI7
-        hs.core
-            .mix(self.spkm.deref())?
-            .mix(peer.get(self).psk.secret())?;
+        protocol_section!("IHI7", {
+            hs.core
+                .mix(self.spkm.deref())?
+                .mix(peer.get(self).psk.secret())?;
+        });
 
         // IHI8
-        hs.core.encrypt_and_mix(ih.auth.as_mut_slice(), &[])?;
+        protocol_section!("IHI8", {
+            hs.core.encrypt_and_mix(ih.auth.as_mut_slice(), &[])?;
+        });
 
         // Update the handshake hash last (not changing any state on prior error
         peer.hs().insert(self, hs)?;
@@ -3597,6 +3630,10 @@ impl CryptoServer {
 
     /// Core cryptographic protocol implementation: Parses an [InitHello] message and produces a
     /// [RespHello] message on the responder side.
+    #[cfg_attr(
+        feature = "trace_bench",
+        rosenpass_util::trace_bench::trace_span("handle_init_hello", rosenpass_util::trace_bench::TRACE)
+    )]
     pub fn handle_init_hello(
         &mut self,
         ih: &InitHello,
@@ -3608,54 +3645,80 @@ impl CryptoServer {
         core.sidi = SessionId::from_slice(&ih.sidi);
 
         // IHR1
-        core.init(self.spkm.deref())?;
+        protocol_section!("IHR1", {
+            core.init(self.spkm.deref())?;
+        });
 
         // IHR4
-        core.mix(&ih.sidi)?.mix(&ih.epki)?;
+        protocol_section!("IHR4", {
+            core.mix(&ih.sidi)?.mix(&ih.epki)?;
+        });
 
         // IHR5
-        core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &ih.sctr)?;
+        protocol_section!("IHR5", {
+            core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &ih.sctr)?;
+        });
 
         // IHR6
-        let peer = {
+        let peer = protocol_section!("IHR6", {
             let mut peerid = PeerId::zero();
             core.decrypt_and_mix(&mut *peerid, &ih.pidic)?;
             self.find_peer(peerid)
                 .with_context(|| format!("No such peer {peerid:?}."))?
-        };
+        });
 
         // IHR7
-        core.mix(peer.get(self).spkt.deref())?
-            .mix(peer.get(self).psk.secret())?;
+        protocol_section!("IHR7", {
+            core.mix(peer.get(self).spkt.deref())?
+                .mix(peer.get(self).psk.secret())?;
+        });
 
         // IHR8
-        core.decrypt_and_mix(&mut [0u8; 0], &ih.auth)?;
+        protocol_section!("IHR8", {
+            core.decrypt_and_mix(&mut [0u8; 0], &ih.auth)?;
+        });
 
         // RHR1
-        core.sidr.randomize();
-        rh.sidi.copy_from_slice(core.sidi.as_ref());
-        rh.sidr.copy_from_slice(core.sidr.as_ref());
+        protocol_section!("RHR1", {
+            core.sidr.randomize();
+            rh.sidi.copy_from_slice(core.sidi.as_ref());
+            rh.sidr.copy_from_slice(core.sidr.as_ref());
+        });
 
         // RHR3
-        core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        protocol_section!("RHR3", {
+            core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        });
 
         // RHR4
-        core.encaps_and_mix(&EphemeralKem, &mut rh.ecti, &ih.epki)?;
+        protocol_section!("RHR4", {
+            core.encaps_and_mix(&EphemeralKem, &mut rh.ecti, &ih.epki)?;
+        });
 
         // RHR5
-        core.encaps_and_mix(&StaticKem, &mut rh.scti, peer.get(self).spkt.deref())?;
+        protocol_section!("RHR5", {
+            core.encaps_and_mix(&StaticKem, &mut rh.scti, peer.get(self).spkt.deref())?;
+        });
 
         // RHR6
-        core.store_biscuit(self, peer, &mut rh.biscuit)?;
+        protocol_section!("RHR6", {
+            core.store_biscuit(self, peer, &mut rh.biscuit)?;
+        });
 
         // RHR7
-        core.encrypt_and_mix(&mut rh.auth, &[])?;
+        protocol_section!("RHR7", {
+            core.encrypt_and_mix(&mut rh.auth, &[])?;
+        });
 
         Ok(peer)
     }
 
     /// Core cryptographic protocol implementation: Parses an [RespHello] message and produces an
     /// [InitConf] message on the initiator side.
+    #[cfg_attr(
+        feature = "trace_bench",
+        rosenpass_util::trace_bench::trace_span("handle_resp_hello", rosenpass_util::trace_bench::TRACE)
+    )]
     pub fn handle_resp_hello(&mut self, rh: &RespHello, ic: &mut InitConf) -> Result<PeerPtr> {
         // RHI2
         let peer = self
@@ -3700,24 +3763,34 @@ impl CryptoServer {
         //       to save us from the repetitive secret unwrapping
 
         // RHI3
-        core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        protocol_section!("RHI3", {
+            core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        });
 
         // RHI4
-        core.decaps_and_mix(
-            &EphemeralKem,
-            hs!().eski.secret(),
-            hs!().epki.deref(),
-            &rh.ecti,
-        )?;
+        protocol_section!("RHI4", {
+            core.decaps_and_mix(
+                &EphemeralKem,
+                hs!().eski.secret(),
+                hs!().epki.deref(),
+                &rh.ecti,
+            )?;
+        });
 
         // RHI5
-        core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &rh.scti)?;
+        protocol_section!("RHI5", {
+            core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &rh.scti)?;
+        });
 
         // RHI6
-        core.mix(&rh.biscuit)?;
+        protocol_section!("RHI6", {
+            core.mix(&rh.biscuit)?;
+        });
 
         // RHI7
-        core.decrypt_and_mix(&mut [0u8; 0], &rh.auth)?;
+        protocol_section!("RHI7", {
+            core.decrypt_and_mix(&mut [0u8; 0], &rh.auth)?;
+        });
 
         // TODO: We should just authenticate the entire network package up to the auth
         // tag as a pattern instead of mixing in fields separately
@@ -3726,27 +3799,33 @@ impl CryptoServer {
         ic.sidr.copy_from_slice(&rh.sidr);
 
         // ICI3
-        core.mix(&ic.sidi)?.mix(&ic.sidr)?;
-        ic.biscuit.copy_from_slice(&rh.biscuit);
+        protocol_section!("ICI3", {
+            core.mix(&ic.sidi)?.mix(&ic.sidr)?;
+            ic.biscuit.copy_from_slice(&rh.biscuit);
+        });
 
         // ICI4
-        core.encrypt_and_mix(&mut ic.auth, &[])?;
+        protocol_section!("ICI4", {
+            core.encrypt_and_mix(&mut ic.auth, &[])?;
+        });
 
         // Split() – We move the secrets into the session; we do not
         // delete the InitiatorHandshake, just clear it's secrets because
         // we still need it for InitConf message retransmission to function.
 
         // ICI7
-        peer.session().insert(
-            self,
-            core.enter_live(
+        protocol_section!("ICI7", {
+            peer.session().insert(
                 self,
-                HandshakeRole::Initiator,
-                peer.get(self).protocol_version.keyed_hash(),
-            )?,
-        )?;
-        hs_mut!().core.erase();
-        hs_mut!().next = HandshakeStateMachine::RespConf;
+                core.enter_live(
+                    self,
+                    HandshakeRole::Initiator,
+                    peer.get(self).protocol_version.keyed_hash(),
+                )?,
+            )?;
+            hs_mut!().core.erase();
+            hs_mut!().next = HandshakeStateMachine::RespConf;
+        });
 
         Ok(peer)
     }
@@ -3756,6 +3835,10 @@ impl CryptoServer {
     ///
     /// This concludes the handshake on the cryptographic level; the [EmptyData] message is just
     /// an acknowledgement message telling the initiator to stop performing retransmissions.
+    #[cfg_attr(
+        feature = "trace_bench",
+        rosenpass_util::trace_bench::trace_span("handle_init_conf", rosenpass_util::trace_bench::TRACE)
+    )]
     pub fn handle_init_conf(
         &mut self,
         ic: &InitConf,
@@ -3764,22 +3847,30 @@ impl CryptoServer {
     ) -> Result<PeerPtr> {
         // (peer, bn) ← LoadBiscuit(InitConf.biscuit)
         // ICR1
-        let (peer, biscuit_no, mut core) = HandshakeState::load_biscuit(
-            self,
-            &ic.biscuit,
-            SessionId::from_slice(&ic.sidi),
-            SessionId::from_slice(&ic.sidr),
-            keyed_hash,
-        )?;
+        let (peer, biscuit_no, mut core) = protocol_section!("ICR1", {
+            HandshakeState::load_biscuit(
+                self,
+                &ic.biscuit,
+                SessionId::from_slice(&ic.sidi),
+                SessionId::from_slice(&ic.sidr),
+                keyed_hash,
+            )?
+        });
 
         // ICR2
-        core.encrypt_and_mix(&mut [0u8; Aead::TAG_LEN], &[])?;
+        protocol_section!("ICR2", {
+            core.encrypt_and_mix(&mut [0u8; Aead::TAG_LEN], &[])?;
+        });
 
         // ICR3
-        core.mix(&ic.sidi)?.mix(&ic.sidr)?;
+        protocol_section!("ICR3", {
+            core.mix(&ic.sidi)?.mix(&ic.sidr)?;
+        });
 
         // ICR4
-        core.decrypt_and_mix(&mut [0u8; 0], &ic.auth)?;
+        protocol_section!("ICR4", {
+            core.decrypt_and_mix(&mut [0u8; 0], &ic.auth)?;
+        });
 
         // ICR5
         // Defense against replay attacks; implementations may accept
@@ -3791,20 +3882,24 @@ impl CryptoServer {
         );
 
         // ICR6
-        peer.get_mut(self).biscuit_used = biscuit_no;
+        protocol_section!("ICR6", {
+            peer.get_mut(self).biscuit_used = biscuit_no;
+        });
 
         // ICR7
-        peer.session().insert(
-            self,
-            core.enter_live(
+        protocol_section!("ICR7", {
+            peer.session().insert(
                 self,
-                HandshakeRole::Responder,
-                peer.get(self).protocol_version.keyed_hash(),
-            )?,
-        )?;
-        // TODO: This should be part of the protocol specification.
-        // Abort any ongoing handshake from initiator role
-        peer.hs().take(self);
+                core.enter_live(
+                    self,
+                    HandshakeRole::Responder,
+                    peer.get(self).protocol_version.keyed_hash(),
+                )?,
+            )?;
+            // TODO: This should be part of the protocol specification.
+            // Abort any ongoing handshake from initiator role
+            peer.hs().take(self);
+        });
 
         // TODO: Implementing RP should be possible without touching the live session stuff
         // TODO: I fear that this may lead to race conditions; the acknowledgement may be
@@ -3849,6 +3944,10 @@ impl CryptoServer {
     /// message then terminates the handshake.
     ///
     /// The EmptyData message is just there to tell the initiator to abort retransmissions.
+    #[cfg_attr(
+        feature = "trace_bench",
+        rosenpass_util::trace_bench::trace_span("handle_resp_conf", rosenpass_util::trace_bench::TRACE)
+    )]
     pub fn handle_resp_conf(
         &mut self,
         msg_in: &Ref<&[u8], Envelope<EmptyData>>,
@@ -3906,6 +4005,10 @@ impl CryptoServer {
     /// DOS mitigation features.
     ///
     /// See more on DOS mitigation in Rosenpass in the [whitepaper](https://rosenpass.eu/whitepaper.pdf).
+    #[cfg_attr(
+        feature = "trace_bench",
+        rosenpass_util::trace_bench::trace_span("handle_cookie_reply", rosenpass_util::trace_bench::TRACE)
+    )]
     pub fn handle_cookie_reply(&mut self, cr: &CookieReply) -> Result<PeerPtr> {
         let peer_ptr: Option<PeerPtr> = self
             .lookup_session(Public::new(cr.inner.sid))
@@ -4030,7 +4133,7 @@ pub mod testutils {
 
 #[cfg(test)]
 mod test {
-    use std::{borrow::BorrowMut, net::SocketAddrV4, ops::DerefMut, thread::sleep, time::Duration};
+    use std::{borrow::BorrowMut, net::SocketAddrV4, ops::DerefMut};
 
     use super::*;
     use serial_test::serial;
diff --git a/util/Cargo.toml b/util/Cargo.toml
index ccc91c8..99d5baf 100644
--- a/util/Cargo.toml
+++ b/util/Cargo.toml
@@ -24,6 +24,9 @@ thiserror = { workspace = true }
 mio = { workspace = true }
 tempfile = { workspace = true }
 uds = { workspace = true, optional = true, features = ["mio_1xx"] }
+libcrux-test-utils = { workspace = true, optional = true }
+lazy_static = { workspace = true, optional = true }
 
 [features]
 experiment_file_descriptor_passing = ["uds"]
+trace_bench = ["dep:libcrux-test-utils", "dep:lazy_static"]
diff --git a/util/src/lib.rs b/util/src/lib.rs
index 66d2387..43acbac 100644
--- a/util/src/lib.rs
+++ b/util/src/lib.rs
@@ -36,3 +36,6 @@ pub mod typenum;
 pub mod zerocopy;
 /// Memory wiping utilities.
 pub mod zeroize;
+/// Trace benchmarking utilities
+#[cfg(feature = "trace_bench")]
+pub mod trace_bench;
diff --git a/util/src/trace_bench.rs b/util/src/trace_bench.rs
new file mode 100644
index 0000000..d659dec
--- /dev/null
+++ b/util/src/trace_bench.rs
@@ -0,0 +1,19 @@
+use std::time::Instant;
+
+use libcrux_test_utils::tracing;
+
+lazy_static::lazy_static! {
+    /// The trace value used in all Rosenpass crates.
+    pub static ref TRACE: RpTrace = RpTrace::default();
+}
+
+/// The trace type used to trace Rosenpass for performance measurement.
+pub type RpTrace = tracing::MutexTrace<&'static str, Instant>;
+
+/// The trace event type used to trace Rosenpass for performance measurement.
+pub type RpEventType = tracing::TraceEvent<&'static str, Instant>;
+
+// Re-export the tracing functionality so that callers do not also need to depend directly on
+// [`libcrux_test_utils`].
+pub use libcrux_test_utils::tracing::trace_span;
+pub use tracing::Trace;

From 196d459a2b93973ae59988875e9cc8f1e5497c60 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Tue, 20 May 2025 10:51:49 +0200
Subject: [PATCH 02/10] fix flake.nix: no more fenix

---
 .github/workflows/bench-primitives.yml | 2 +-
 .github/workflows/bench-protocol.yml   | 2 +-
 flake.nix                              | 9 ---------
 3 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/bench-primitives.yml b/.github/workflows/bench-primitives.yml
index c2a6234..b0edaf1 100644
--- a/.github/workflows/bench-primitives.yml
+++ b/.github/workflows/bench-primitives.yml
@@ -67,7 +67,7 @@ jobs:
 
       - name: 🏃🏻‍♀️ Benchmarks (using Nix as shell)
         working-directory: ciphers
-        run: nix develop ".#devShells.${{ matrix.system }}.benchmark" --command cargo bench -F bench --bench primitives --verbose $RUST_TARGET_FLAG -- --output-format bencher | tee ../bench-primitives.txt
+        run: nix develop --command cargo bench -F bench --bench primitives --verbose $RUST_TARGET_FLAG -- --output-format bencher | tee ../bench-primitives.txt
 
       - name: Extract benchmarks
         uses: cryspen/benchmark-data-extract-transform@v2
diff --git a/.github/workflows/bench-protocol.yml b/.github/workflows/bench-protocol.yml
index 881d12d..6a66f74 100644
--- a/.github/workflows/bench-protocol.yml
+++ b/.github/workflows/bench-protocol.yml
@@ -66,7 +66,7 @@ jobs:
       # Benchmarks ...
 
       - name: 🏃🏻‍♀️ Benchmarks
-        run: nix develop ".#devShells.${{ matrix.system }}.benchmark" --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose $RUST_TARGET_FLAG >bench-protocol.json
+        run: nix develop --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose $RUST_TARGET_FLAG >bench-protocol.json
 
       - name: Upload benchmarks
         uses: cryspen/benchmark-upload-and-plot-action@v3
diff --git a/flake.nix b/flake.nix
index 4b3413f..8ae3fad 100644
--- a/flake.nix
+++ b/flake.nix
@@ -173,15 +173,6 @@
                 inherit (pkgs.cargo-llvm-cov) LLVM_COV LLVM_PROFDATA;
               };
             };
-            devShells.benchmark = pkgs.mkShell {
-              inputsFrom = [ pkgs.rosenpass ];
-              nativeBuildInputs = let
-                      rustToolchain = (inputs.fenix.packages.${system}.toolchainOf {
-                          channel = "1.77.0";
-                          sha256 = "sha256-+syqAd2kX8KVa8/U2gz3blIQTTsYYt3U63xBWaGOSc8=";
-                      });
-                  in [ rustToolchain.toolchain ];
-            };
 
             checks =
               {

From cf061bd0f5457b678ddb8b285011cefac2521f98 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Tue, 20 May 2025 11:03:12 +0200
Subject: [PATCH 03/10] workflows: use arch-specific dev shell

---
 .github/workflows/bench-primitives.yml |  2 +-
 .github/workflows/bench-protocol.yml   |  2 +-
 ciphers/Cargo.toml                     | 12 ++++++------
 util/src/lib.rs                        |  6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/bench-primitives.yml b/.github/workflows/bench-primitives.yml
index b0edaf1..395bdea 100644
--- a/.github/workflows/bench-primitives.yml
+++ b/.github/workflows/bench-primitives.yml
@@ -67,7 +67,7 @@ jobs:
 
       - name: 🏃🏻‍♀️ Benchmarks (using Nix as shell)
         working-directory: ciphers
-        run: nix develop --command cargo bench -F bench --bench primitives --verbose $RUST_TARGET_FLAG -- --output-format bencher | tee ../bench-primitives.txt
+        run: nix develop ".#devShells.${{ matrix.system }}.default" --command cargo bench -F bench --bench primitives --verbose $RUST_TARGET_FLAG -- --output-format bencher | tee ../bench-primitives.txt
 
       - name: Extract benchmarks
         uses: cryspen/benchmark-data-extract-transform@v2
diff --git a/.github/workflows/bench-protocol.yml b/.github/workflows/bench-protocol.yml
index 6a66f74..752ed90 100644
--- a/.github/workflows/bench-protocol.yml
+++ b/.github/workflows/bench-protocol.yml
@@ -66,7 +66,7 @@ jobs:
       # Benchmarks ...
 
       - name: 🏃🏻‍♀️ Benchmarks
-        run: nix develop --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose $RUST_TARGET_FLAG >bench-protocol.json
+        run: nix develop ".#devShells.${{ matrix.system }}.default" --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose $RUST_TARGET_FLAG >bench-protocol.json
 
       - name: Upload benchmarks
         uses: cryspen/benchmark-upload-and-plot-action@v3
diff --git a/ciphers/Cargo.toml b/ciphers/Cargo.toml
index 080deeb..5a1c375 100644
--- a/ciphers/Cargo.toml
+++ b/ciphers/Cargo.toml
@@ -25,12 +25,12 @@ experiment_libcrux_chachapoly_test = [
 ]
 experiment_libcrux_kyber = ["dep:libcrux-ml-kem", "dep:rand"]
 bench = [
-    "dep:thiserror",
-    "dep:rand",
-    "dep:libcrux",
-    "dep:libcrux-blake2",
-    "dep:libcrux-ml-kem",
-    "dep:libcrux-chacha20poly1305",
+  "dep:thiserror",
+  "dep:rand",
+  "dep:libcrux",
+  "dep:libcrux-blake2",
+  "dep:libcrux-ml-kem",
+  "dep:libcrux-chacha20poly1305",
 ]
 
 [[bench]]
diff --git a/util/src/lib.rs b/util/src/lib.rs
index 43acbac..7949a3b 100644
--- a/util/src/lib.rs
+++ b/util/src/lib.rs
@@ -30,12 +30,12 @@ pub mod option;
 pub mod result;
 /// Time and duration utilities.
 pub mod time;
+/// Trace benchmarking utilities
+#[cfg(feature = "trace_bench")]
+pub mod trace_bench;
 /// Type-level numbers and arithmetic.
 pub mod typenum;
 /// Zero-copy serialization utilities.
 pub mod zerocopy;
 /// Memory wiping utilities.
 pub mod zeroize;
-/// Trace benchmarking utilities
-#[cfg(feature = "trace_bench")]
-pub mod trace_bench;

From 77b50b70b1f91ffc43162b6d0346b40753c2dd62 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Tue, 27 May 2025 16:29:46 +0200
Subject: [PATCH 04/10] address feedback

---
 .github/workflows/bench-primitives.yml | 22 +++-----
 .github/workflows/bench-protocol.yml   | 17 ++----
 ciphers/benches/primitives.rs          |  2 -
 flake.nix                              |  8 +++
 readme.md                              | 73 ++++++++++++++------------
 rosenpass/benches/trace_handshake.rs   |  6 +--
 6 files changed, 62 insertions(+), 66 deletions(-)

diff --git a/.github/workflows/bench-primitives.yml b/.github/workflows/bench-primitives.yml
index 395bdea..d35855c 100644
--- a/.github/workflows/bench-primitives.yml
+++ b/.github/workflows/bench-primitives.yml
@@ -37,14 +37,6 @@ jobs:
 
       # Set up environment
 
-      - name: 🛠️ Config Linux x64
-        run: echo "RUST_TARGET_FLAG=--target=x86_64-unknown-linux-gnu"  > $GITHUB_ENV
-        if: ${{ matrix.system == 'x86_64-linux' }}
-
-      - name: 🛠️ Config Linux x86
-        run: echo "RUST_TARGET_FLAG=--target=i686-unknown-linux-gnu"    > $GITHUB_ENV
-        if: ${{ matrix.system == 'i686-linux' }}
-
       - name: 🛠️ Prepare Benchmark Path
         env:
           EVENT_NAME:  ${{ github.event_name }}
@@ -67,7 +59,7 @@ jobs:
 
       - name: 🏃🏻‍♀️ Benchmarks (using Nix as shell)
         working-directory: ciphers
-        run: nix develop ".#devShells.${{ matrix.system }}.default" --command cargo bench -F bench --bench primitives --verbose $RUST_TARGET_FLAG -- --output-format bencher | tee ../bench-primitives.txt
+        run: nix develop ".#devShells.${{ matrix.system }}.benchmarks" --command cargo bench -F bench --bench primitives --verbose -- --output-format bencher | tee ../bench-primitives.txt
 
       - name: Extract benchmarks
         uses: cryspen/benchmark-data-extract-transform@v2
@@ -76,22 +68,24 @@ jobs:
           tool: "cargo"
           os: ${{ matrix.system }}
           output-file-path: bench-primitives.txt
-          data-out-path: bench-primitives.json
+          data-out-path: bench-primitives-os.json
+
+      - name: Fix up 'os' label in benchmark data
+        run: jq 'map(with_entries(.key |= if . == "os" then "operating system" else . end))' <bench-primitives-os.json >bench-primitives.json
 
       - name: Upload benchmarks
         uses: cryspen/benchmark-upload-and-plot-action@v3
         with:
           name: Crypto Primitives Benchmarks
-          group-by: "os,primitive,algorithm"
-          schema: "os,primitive,algorithm,implementation,operation,length"
+          group-by: "operating system,primitive,algorithm"
+          schema: "operating system,primitive,algorithm,implementation,operation,length"
           input-data-path: bench-primitives.json
           github-token: ${{ secrets.GITHUB_TOKEN }}
           # NOTE: pushes to current repository
           gh-repository: github.com/${{ github.repository }}
-          # use the default (gh-pages) for the demo
-          #gh-pages-branch: benchmarks
           auto-push: true
           fail-on-alert: true
+          base-path: benchmarks/
 
   ciphers-primitives-bench-status:
     if: ${{ always() }}
diff --git a/.github/workflows/bench-protocol.yml b/.github/workflows/bench-protocol.yml
index 752ed90..f531651 100644
--- a/.github/workflows/bench-protocol.yml
+++ b/.github/workflows/bench-protocol.yml
@@ -37,14 +37,6 @@ jobs:
 
       # Set up environment
 
-      - name: 🛠️ Config Linux x64
-        run: echo "RUST_TARGET_FLAG=--target=x86_64-unknown-linux-gnu"  > $GITHUB_ENV
-        if: ${{ matrix.system == 'x86_64-linux' }}
-
-      - name: 🛠️ Config Linux x86
-        run: echo "RUST_TARGET_FLAG=--target=i686-unknown-linux-gnu"    > $GITHUB_ENV
-        if: ${{ matrix.system == 'i686-linux' }}
-
       - name: 🛠️ Prepare Benchmark Path
         env:
           EVENT_NAME:  ${{ github.event_name }}
@@ -66,22 +58,21 @@ jobs:
       # Benchmarks ...
 
       - name: 🏃🏻‍♀️ Benchmarks
-        run: nix develop ".#devShells.${{ matrix.system }}.default" --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose $RUST_TARGET_FLAG >bench-protocol.json
+        run: nix develop ".#devShells.${{ matrix.system }}.benchmarks" --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose >bench-protocol.json
 
       - name: Upload benchmarks
         uses: cryspen/benchmark-upload-and-plot-action@v3
         with:
           name: Protocol Benchmarks
-          group-by: "os,arch,protocol version,run time"
-          schema: "os,arch,protocol version,run time,name"
+          group-by: "operating system,architecture,protocol version,run time"
+          schema: "operating system,architecture,protocol version,run time,name"
           input-data-path: bench-protocol.json
           github-token: ${{ secrets.GITHUB_TOKEN }}
           # NOTE: pushes to current repository
           gh-repository: github.com/${{ github.repository }}
-          # use the default (gh-pages) for the demo
-          #gh-pages-branch: benchmarks
           auto-push: true
           fail-on-alert: true
+          base-path: benchmarks/
 
   ciphers-protocol-bench-status:
     if: ${{ always() }}
diff --git a/ciphers/benches/primitives.rs b/ciphers/benches/primitives.rs
index eb81c66..61a649b 100644
--- a/ciphers/benches/primitives.rs
+++ b/ciphers/benches/primitives.rs
@@ -342,5 +342,3 @@ mod keyed_hash {
         });
     }
 }
-
-mod templates {}
diff --git a/flake.nix b/flake.nix
index 8ae3fad..9e2a935 100644
--- a/flake.nix
+++ b/flake.nix
@@ -173,6 +173,14 @@
                 inherit (pkgs.cargo-llvm-cov) LLVM_COV LLVM_PROFDATA;
               };
             };
+            devShells.benchmarks = pkgs.mkShell {
+              inputsFrom = [ pkgs.rosenpass ];
+              nativeBuildInputs = with pkgs; [
+                cargo-release
+                clippy
+                rustfmt
+              ];
+            };
 
             checks =
               {
diff --git a/readme.md b/readme.md
index c4ceb1a..c13782b 100644
--- a/readme.md
+++ b/readme.md
@@ -1,37 +1,3 @@
-# Changes on This Branch
-
-This branch adds facilities for benchmarking both the Rosenpass protocol
-code and the implementations of the primitives behind it. The primitives
-are benchmarked using criterion. For the protocol code, we use a custom
-library for instrumenting the code such that events are written to a
-trace, which is then inspected after a run.
-
-## Protocol Benchmark
-
-The trace that is being written to lives in a new module
-`trace_bench` in the util crate. A basic benchmark that
-performs some minor statistical analysis of the trace can be run using
-
-```
-cargo bench -p rosenpass --bench trace_handshake -F trace_bench
-```
-
-## Primitive Benchmark
-
-Benchmarks for the functions of the traits `Kem`, `Aead` and `KeyedHash`
-have been added and are run for all implementations in the `primitives`
-benchmark of `rosenpass-ciphers`. Run the benchmarks using
-
-```
-cargo bench -p rosenpass-ciphers --bench primitives -F bench
-```
-
-Note that the `bench` feature enables the inclusion of the libcrux-backed
-trait implementations in the module tree, but does not enable them
-as default.
-
----
-
 # Rosenpass README
 
 ![Nix](https://github.com/rosenpass/rosenpass/actions/workflows/nix.yaml/badge.svg)
@@ -117,6 +83,45 @@ Rosenpass is also available as prebuilt Docker images:
 
 For details on how to use these images, refer to the [Docker usage guide](docker/USAGE.md).
 
+## Benchmarks
+
+This repository contains facilities for benchmarking both the Rosenpass
+protocol code and the implementations of the cryptographic primitives used
+by it. The primitives are benchmarked using criterion. For the protocol code
+benchmarks we use a library for instrumenting the code such that events are
+written to a trace, which is then inspected after a run.
+
+Benchmarks are automatically run on CI. The measurements are visualized in the
+[Benchmark Dashboard].
+
+[Benchmark Dashboard]: https://rosenpass.github.io/benchmarks
+ 
+### Primitive Benchmarks
+
+There are benchmarks for the functions of the traits `Kem`, `Aead` and
+`KeyedHash`. They are run for all implementations in the `primitives`
+benchmark of `rosenpass-ciphers`. Run the benchmarks using
+
+```
+cargo bench -p rosenpass-ciphers --bench primitives -F bench
+```
+
+Note that the `bench` feature enables the inclusion of the libcrux-backed
+trait implementations in the module tree, but does not enable them
+as default.
+
+### Protocol Benchmarks
+
+The trace that is being written to lives in the module
+`trace_bench` in the util crate. A basic benchmark that
+performs some minor statistical analysis of the trace can be run using
+
+```
+cargo bench -p rosenpass --bench trace_handshake -F trace_bench
+```
+
+---
+
 # Mirrors
 
 Don't want to use GitHub or only have an IPv6 connection? Rosenpass has set up two mirrors for this:
diff --git a/rosenpass/benches/trace_handshake.rs b/rosenpass/benches/trace_handshake.rs
index 9adaafe..a8f6d16 100644
--- a/rosenpass/benches/trace_handshake.rs
+++ b/rosenpass/benches/trace_handshake.rs
@@ -183,8 +183,8 @@ impl std::fmt::Display for RunTimeGroup {
         let txt = match self {
             RunTimeGroup::Long => "long",
             RunTimeGroup::Medium => "medium",
-            RunTimeGroup::BelowMillisec => "below_ms",
-            RunTimeGroup::BelowMicrosec => "below_us",
+            RunTimeGroup::BelowMillisec => "below 1ms",
+            RunTimeGroup::BelowMicrosec => "below 1us",
         };
         write!(f, "{txt}")
     }
@@ -357,7 +357,7 @@ impl AggregateStat<Duration> {
         // Format the JSON string using measured values and environment constants
         writeln!(
             w,
-            r#"{{"name":"{name}", "unit":"ns/iter", "value":"{value}", "range":"± {range}", "protocol version":"{protocol_version}", "sample size":"{sample_size}", "os":"{os}", "arch":"{arch}", "run time":"{run_time}"}}"#,
+            r#"{{"name":"{name}", "unit":"ns/iter", "value":"{value}", "range":"± {range}", "protocol version":"{protocol_version}", "sample size":"{sample_size}", "operating system":"{os}", "architecture":"{arch}", "run time":"{run_time}"}}"#,
             name = label,                          // Benchmark name
             value = self.mean_duration.as_nanos(), // Mean duration in nanoseconds
             range = self.sd_duration.as_nanos(),   // Standard deviation in nanoseconds

From 7fc6fd2f52e05bf9e644d1d7463a654196b9db94 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Tue, 27 May 2025 18:20:45 +0200
Subject: [PATCH 05/10] format readme

---
 readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index c13782b..655a563 100644
--- a/readme.md
+++ b/readme.md
@@ -95,7 +95,7 @@ Benchmarks are automatically run on CI. The measurements are visualized in the
 [Benchmark Dashboard].
 
 [Benchmark Dashboard]: https://rosenpass.github.io/benchmarks
- 
+
 ### Primitive Benchmarks
 
 There are benchmarks for the functions of the traits `Kem`, `Aead` and

From 5106ffd549186a404f815f1268021866d52f11b3 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Tue, 27 May 2025 18:30:24 +0200
Subject: [PATCH 06/10] strictly format attr macros

---
 rosenpass/src/protocol/protocol.rs | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/rosenpass/src/protocol/protocol.rs b/rosenpass/src/protocol/protocol.rs
index 5a5e7c5..bbc7920 100644
--- a/rosenpass/src/protocol/protocol.rs
+++ b/rosenpass/src/protocol/protocol.rs
@@ -3565,7 +3565,10 @@ impl CryptoServer {
     /// on the initiator side, producing the InitHello message.
     #[cfg_attr(
         feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span("handle_initiation", rosenpass_util::trace_bench::TRACE)
+        rosenpass_util::trace_bench::trace_span(
+            "handle_initiation",
+            rosenpass_util::trace_bench::TRACE
+        )
     )]
     pub fn handle_initiation(&mut self, peer: PeerPtr, ih: &mut InitHello) -> Result<PeerPtr> {
         let mut hs = InitiatorHandshake::zero_with_timestamp(
@@ -3632,7 +3635,10 @@ impl CryptoServer {
     /// [RespHello] message on the responder side.
     #[cfg_attr(
         feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span("handle_init_hello", rosenpass_util::trace_bench::TRACE)
+        rosenpass_util::trace_bench::trace_span(
+            "handle_init_hello",
+            rosenpass_util::trace_bench::TRACE
+        )
     )]
     pub fn handle_init_hello(
         &mut self,
@@ -3717,7 +3723,10 @@ impl CryptoServer {
     /// [InitConf] message on the initiator side.
     #[cfg_attr(
         feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span("handle_resp_hello", rosenpass_util::trace_bench::TRACE)
+        rosenpass_util::trace_bench::trace_span(
+            "handle_resp_hello",
+            rosenpass_util::trace_bench::TRACE
+        )
     )]
     pub fn handle_resp_hello(&mut self, rh: &RespHello, ic: &mut InitConf) -> Result<PeerPtr> {
         // RHI2
@@ -3837,7 +3846,10 @@ impl CryptoServer {
     /// an acknowledgement message telling the initiator to stop performing retransmissions.
     #[cfg_attr(
         feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span("handle_init_conf", rosenpass_util::trace_bench::TRACE)
+        rosenpass_util::trace_bench::trace_span(
+            "handle_init_conf",
+            rosenpass_util::trace_bench::TRACE
+        )
     )]
     pub fn handle_init_conf(
         &mut self,
@@ -3946,7 +3958,10 @@ impl CryptoServer {
     /// The EmptyData message is just there to tell the initiator to abort retransmissions.
     #[cfg_attr(
         feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span("handle_resp_conf", rosenpass_util::trace_bench::TRACE)
+        rosenpass_util::trace_bench::trace_span(
+            "handle_resp_conf",
+            rosenpass_util::trace_bench::TRACE
+        )
     )]
     pub fn handle_resp_conf(
         &mut self,
@@ -4007,7 +4022,10 @@ impl CryptoServer {
     /// See more on DOS mitigation in Rosenpass in the [whitepaper](https://rosenpass.eu/whitepaper.pdf).
     #[cfg_attr(
         feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span("handle_cookie_reply", rosenpass_util::trace_bench::TRACE)
+        rosenpass_util::trace_bench::trace_span(
+            "handle_cookie_reply",
+            rosenpass_util::trace_bench::TRACE
+        )
     )]
     pub fn handle_cookie_reply(&mut self, cr: &CookieReply) -> Result<PeerPtr> {
         let peer_ptr: Option<PeerPtr> = self

From 9cc7a58ee7e12428caef54acd60bb8e4cea1ca0c Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Tue, 3 Jun 2025 10:05:47 +0200
Subject: [PATCH 07/10] Set adequate permissions to push benchmarks

---
 .github/workflows/bench-primitives.yml | 3 +++
 .github/workflows/bench-protocol.yml   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.github/workflows/bench-primitives.yml b/.github/workflows/bench-primitives.yml
index d35855c..dc11cdd 100644
--- a/.github/workflows/bench-primitives.yml
+++ b/.github/workflows/bench-primitives.yml
@@ -1,5 +1,8 @@
 name: rosenpass-ciphers - primitives - benchmark
 
+permissions:
+  contents: write
+
 on:
   pull_request:
   push:
diff --git a/.github/workflows/bench-protocol.yml b/.github/workflows/bench-protocol.yml
index f531651..139e6f6 100644
--- a/.github/workflows/bench-protocol.yml
+++ b/.github/workflows/bench-protocol.yml
@@ -1,5 +1,8 @@
 name: rosenpass - protocol - benchmark
 
+permissions:
+  contents: write
+
 on:
   pull_request:
   push:

From 73df0ceca7818533631931199abddcb57d10a571 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Wed, 4 Jun 2025 10:55:43 +0200
Subject: [PATCH 08/10] Address feedback

---
 ciphers/benches/primitives.rs        | 36 +++++++++++++++++++++++++++-
 rosenpass/benches/trace_handshake.rs | 23 ++++++++++--------
 rosenpass/src/protocol/protocol.rs   |  2 +-
 util/src/trace_bench.rs              |  2 +-
 4 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/ciphers/benches/primitives.rs b/ciphers/benches/primitives.rs
index 61a649b..f1ef4d7 100644
--- a/ciphers/benches/primitives.rs
+++ b/ciphers/benches/primitives.rs
@@ -194,6 +194,33 @@ mod aead {
         let nonce = [23; NONCE_LEN];
         let ad = [];
 
+        c.bench_function(&aead_benchid("encrypt", "0byte"), |bench| {
+            const DATA_LEN: usize = 0;
+
+            let ptxt = [];
+            let mut ctxt = [0; DATA_LEN + TAG_LEN];
+
+            bench.iter(|| {
+                scheme.encrypt(&mut ctxt, &key, &nonce, &ad, &ptxt).unwrap();
+            });
+        });
+
+        c.bench_function(&aead_benchid("decrypt", "0byte"), |bench| {
+            const DATA_LEN: usize = 0;
+
+            let ptxt = [];
+            let mut ctxt = [0; DATA_LEN + TAG_LEN];
+            let mut ptxt_out = [0u8; DATA_LEN];
+
+            scheme.encrypt(&mut ctxt, &key, &nonce, &ad, &ptxt).unwrap();
+
+            bench.iter(|| {
+                scheme
+                    .decrypt(&mut ptxt_out, &key, &nonce, &ad, &mut ctxt)
+                    .unwrap()
+            })
+        });
+
         c.bench_function(&aead_benchid("encrypt", "32byte"), |bench| {
             const DATA_LEN: usize = 32;
 
@@ -312,7 +339,14 @@ mod keyed_hash {
         ];
         let keyedhash_benchid = |len| benchid(KvPairs(&base), KvPairs(&[KvPair("length", len)]));
 
-        c.bench_function(&keyedhash_benchid("32byte"), |bench| {
+        c.bench_function(&keyedhash_benchid("0byte"), |bench| {
+            let bytes = [];
+
+            bench.iter(|| {
+                H::keyed_hash(&key, &bytes, &mut out).unwrap();
+            })
+        })
+        .bench_function(&keyedhash_benchid("32byte"), |bench| {
             let bytes = [34u8; 32];
 
             bench.iter(|| {
diff --git a/rosenpass/benches/trace_handshake.rs b/rosenpass/benches/trace_handshake.rs
index a8f6d16..7710495 100644
--- a/rosenpass/benches/trace_handshake.rs
+++ b/rosenpass/benches/trace_handshake.rs
@@ -1,4 +1,3 @@
-// Standard library imports
 use std::{
     collections::HashMap,
     hint::black_box,
@@ -7,17 +6,18 @@ use std::{
     time::{Duration, Instant},
 };
 
-// External crate imports
 use anyhow::Result;
 use libcrux_test_utils::tracing::{EventType, Trace as _};
-use rosenpass::protocol::{
-    CryptoServer, HandleMsgResult, MsgBuf, PeerPtr, ProtocolVersion, SPk, SSk, SymKey,
-};
+
 use rosenpass_cipher_traits::primitives::Kem;
 use rosenpass_ciphers::StaticKem;
 use rosenpass_secret_memory::secret_policy_try_use_memfd_secrets;
 use rosenpass_util::trace_bench::{RpEventType, TRACE};
 
+use rosenpass::protocol::{
+    CryptoServer, HandleMsgResult, MsgBuf, PeerPtr, ProtocolVersion, SPk, SSk, SymKey,
+};
+
 const ITERATIONS: usize = 100;
 
 fn handle(
@@ -116,8 +116,11 @@ fn main() {
     .expect("error writing json data");
 }
 
-/// Takes a vector of trace events, bins them by label, extracts durations,
-/// filters empty bins, calculates aggregate statistics (mean, std dev), and returns them.
+/// Performs a simple statistical analysis:
+/// - bins trace events by label
+/// - extracts durations of spans
+/// - filters out empty bins
+/// - calculates aggregate statistics (mean, std dev)
 fn statistical_analysis(trace: Vec<RpEventType>) -> Vec<(&'static str, AggregateStat<Duration>)> {
     bin_events(trace)
         .into_iter()
@@ -132,9 +135,9 @@ fn statistical_analysis(trace: Vec<RpEventType>) -> Vec<(&'static str, Aggregate
 ///
 /// # Arguments
 /// * `w` - The writer to output JSON to (e.g., stdout, file).
-/// * `item_groups` - An iterator producing tuples of (`&'static str`, `II`), where
-///   `II` is itself an iterator producing (`&'static str`, `AggregateStat<Duration>`).
-///   Represents the protocol_version name and the statistics items within that protocol_version.
+/// * `item_groups` - An iterator producing tuples `(version, stats): (&'static str, II)`.
+///    Here `II` is itself an iterator producing `(label, agg_stat): (&'static str, AggregateStat<Duration>)`,
+///    where the label is the label of the span, e.g. "IHI2".
 ///
 /// # Type Parameters
 /// * `W` - A type that implements `std::io::Write`.
diff --git a/rosenpass/src/protocol/protocol.rs b/rosenpass/src/protocol/protocol.rs
index bbc7920..dd95a5c 100644
--- a/rosenpass/src/protocol/protocol.rs
+++ b/rosenpass/src/protocol/protocol.rs
@@ -3548,7 +3548,7 @@ impl CryptoServer {
 
 /// Marks a section of the protocol using the same identifiers as are used in the whitepaper.
 /// When building with the trace benchmarking feature enabled, this also emits span events into the
-/// trace, which allows reconstructing the run times of the individual sections for performace
+/// trace, which allows reconstructing the run times of the individual sections for performance
 /// measurement.
 macro_rules! protocol_section {
     ($label:expr, $body:tt) => {{
diff --git a/util/src/trace_bench.rs b/util/src/trace_bench.rs
index d659dec..5f1528a 100644
--- a/util/src/trace_bench.rs
+++ b/util/src/trace_bench.rs
@@ -13,7 +13,7 @@ pub type RpTrace = tracing::MutexTrace<&'static str, Instant>;
 /// The trace event type used to trace Rosenpass for performance measurement.
 pub type RpEventType = tracing::TraceEvent<&'static str, Instant>;
 
-// Re-export to make functionality availalable and callers don't need to also directly depend on
+// Re-export to make functionality available and callers don't need to also directly depend on
 // [`libcrux_test_utils`].
 pub use libcrux_test_utils::tracing::trace_span;
 pub use tracing::Trace;

From 91707cc430710949f0e97356589b558a74a10bc1 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Mon, 23 Jun 2025 15:29:04 +0200
Subject: [PATCH 09/10] Address feedback

---
 Cargo.lock                           |  1 -
 ciphers/Cargo.toml                   | 43 ++++++++++--------
 ciphers/src/subtle/libcrux/mod.rs    |  6 +--
 ciphers/src/subtle/mod.rs            |  7 ++-
 readme.md                            |  6 ++-
 rosenpass/benches/trace_handshake.rs | 18 ++++++--
 rosenpass/src/protocol/protocol.rs   | 67 ++++++++++------------------
 util/Cargo.toml                      |  3 +-
 util/src/trace_bench.rs              | 12 +++--
 9 files changed, 82 insertions(+), 81 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b356525..cbe6f62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2172,7 +2172,6 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "base64ct",
- "lazy_static",
  "libcrux-test-utils",
  "mio",
  "rustix",
diff --git a/ciphers/Cargo.toml b/ciphers/Cargo.toml
index 5a1c375..c20e338 100644
--- a/ciphers/Cargo.toml
+++ b/ciphers/Cargo.toml
@@ -11,27 +11,34 @@ readme = "readme.md"
 rust-version = "1.77.0"
 
 [features]
-experiment_libcrux_all = [
-  "experiment_libcrux_blake2",
-  "experiment_libcrux_chachapoly",
-  "experiment_libcrux_chachapoly_test",
-  "experiment_libcrux_kyber",
-]
-experiment_libcrux_blake2 = ["dep:libcrux-blake2", "dep:thiserror"]
-experiment_libcrux_chachapoly = ["dep:libcrux-chacha20poly1305"]
+# whether the types should be defined
+experiment_libcrux_define_blake2 = ["dep:libcrux-blake2", "dep:thiserror"]
+experiment_libcrux_define_kyber = ["dep:libcrux-ml-kem", "dep:rand"]
+experiment_libcrux_define_chachapoly = ["dep:libcrux-chacha20poly1305"]
+
+# whether the types should be used by default
+experiment_libcrux_blake2 = ["experiment_libcrux_define_blake2"]
+experiment_libcrux_kyber = ["experiment_libcrux_define_kyber"]
+experiment_libcrux_chachapoly = ["experiment_libcrux_define_chachapoly"]
 experiment_libcrux_chachapoly_test = [
-  "experiment_libcrux_chachapoly",
-  "dep:libcrux",
+    "experiment_libcrux_define_chachapoly",
+    "dep:libcrux",
 ]
-experiment_libcrux_kyber = ["dep:libcrux-ml-kem", "dep:rand"]
-bench = [
-  "dep:thiserror",
-  "dep:rand",
-  "dep:libcrux",
-  "dep:libcrux-blake2",
-  "dep:libcrux-ml-kem",
-  "dep:libcrux-chacha20poly1305",
+
+# shorthands
+experiment_libcrux_define_all = [
+    "experiment_libcrux_define_blake2",
+    "experiment_libcrux_define_chachapoly",
+    "experiment_libcrux_define_kyber",
 ]
+experiment_libcrux_all = [
+    "experiment_libcrux_blake2",
+    "experiment_libcrux_chachapoly",
+    "experiment_libcrux_chachapoly_test",
+    "experiment_libcrux_kyber",
+]
+
+bench = ["experiment_libcrux_define_all"]
 
 [[bench]]
 name = "primitives"
diff --git a/ciphers/src/subtle/libcrux/mod.rs b/ciphers/src/subtle/libcrux/mod.rs
index 210bc61..f481e53 100644
--- a/ciphers/src/subtle/libcrux/mod.rs
+++ b/ciphers/src/subtle/libcrux/mod.rs
@@ -4,11 +4,11 @@
 //!
 //! [Github](https://github.com/cryspen/libcrux)
 
-#[cfg(any(feature = "experiment_libcrux_blake2", feature = "bench"))]
+#[cfg(feature = "experiment_libcrux_define_blake2")]
 pub mod blake2b;
 
-#[cfg(any(feature = "experiment_libcrux_chachapoly", feature = "bench"))]
+#[cfg(feature = "experiment_libcrux_define_chachapoly")]
 pub mod chacha20poly1305_ietf;
 
-#[cfg(any(feature = "experiment_libcrux_kyber", feature = "bench"))]
+#[cfg(feature = "experiment_libcrux_define_kyber")]
 pub mod kyber512;
diff --git a/ciphers/src/subtle/mod.rs b/ciphers/src/subtle/mod.rs
index 6221324..6d3083b 100644
--- a/ciphers/src/subtle/mod.rs
+++ b/ciphers/src/subtle/mod.rs
@@ -9,9 +9,8 @@ pub mod custom;
 pub mod rust_crypto;
 
 #[cfg(any(
-    feature = "experiment_libcrux_blake2",
-    feature = "experiment_libcrux_chachapoly",
-    feature = "experiment_libcrux_kyber",
-    feature = "bench"
+    feature = "experiment_libcrux_define_blake2",
+    feature = "experiment_libcrux_define_chachapoly",
+    feature = "experiment_libcrux_define_kyber",
 ))]
 pub mod libcrux;
diff --git a/readme.md b/readme.md
index 655a563..6021361 100644
--- a/readme.md
+++ b/readme.md
@@ -94,13 +94,13 @@ written to a trace, which is then inspected after a run.
 Benchmarks are automatically run on CI. The measurements are visualized in the
 [Benchmark Dashboard].
 
-[Benchmark Dashboard]: https://rosenpass.github.io/benchmarks
+[Benchmark Dashboard]: https://rosenpass.github.io/rosenpass/benchmarks
 
 ### Primitive Benchmarks
 
 There are benchmarks for the functions of the traits `Kem`, `Aead` and
 `KeyedHash`. They are run for all implementations in the `primitives`
-benchmark of `rosenpass-ciphers`. Run the benchmarks using
+benchmark of `rosenpass-ciphers`. Run the benchmarks and view their results using
 
 ```
 cargo bench -p rosenpass-ciphers --bench primitives -F bench
@@ -120,6 +120,8 @@ performs some minor statistical analysis of the trace can be run using
 cargo bench -p rosenpass --bench trace_handshake -F trace_bench
 ```
 
+This runs the benchmarks and prints the results in machine-readable JSON.
+
 ---
 
 # Mirrors
diff --git a/rosenpass/benches/trace_handshake.rs b/rosenpass/benches/trace_handshake.rs
index 7710495..95a7bdb 100644
--- a/rosenpass/benches/trace_handshake.rs
+++ b/rosenpass/benches/trace_handshake.rs
@@ -12,7 +12,7 @@ use libcrux_test_utils::tracing::{EventType, Trace as _};
 use rosenpass_cipher_traits::primitives::Kem;
 use rosenpass_ciphers::StaticKem;
 use rosenpass_secret_memory::secret_policy_try_use_memfd_secrets;
-use rosenpass_util::trace_bench::{RpEventType, TRACE};
+use rosenpass_util::trace_bench::RpEventType;
 
 use rosenpass::protocol::{
     CryptoServer, HandleMsgResult, MsgBuf, PeerPtr, ProtocolVersion, SPk, SSk, SymKey,
@@ -20,6 +20,10 @@ use rosenpass::protocol::{
 
 const ITERATIONS: usize = 100;
 
+/// Performs a full protocol run by processing a message and recursing into handling that message,
+/// until no further response is produced. Returns the keys produced by the two parties.
+///
+/// Ensures that each party produces one of the two keys.
 fn handle(
     tx: &mut CryptoServer,
     msgb: &mut MsgBuf,
@@ -46,6 +50,10 @@ fn handle(
     Ok((txk, rxk.or(xch)))
 }
 
+/// Performs the full handshake by calling `handle` with the correct values, based on just two
+/// `CryptoServer`s.
+///
+/// Ensures that both parties compute the same keys.
 fn hs(ini: &mut CryptoServer, res: &mut CryptoServer) -> Result<()> {
     let (mut inib, mut resb) = (MsgBuf::zero(), MsgBuf::zero());
     let sz = ini.initiate_handshake(PeerPtr(0), &mut *inib)?;
@@ -54,12 +62,14 @@ fn hs(ini: &mut CryptoServer, res: &mut CryptoServer) -> Result<()> {
     Ok(())
 }
 
+/// Generates a new key pair.
 fn keygen() -> Result<(SSk, SPk)> {
     let (mut sk, mut pk) = (SSk::zero(), SPk::zero());
     StaticKem.keygen(sk.secret_mut(), pk.deref_mut())?;
     Ok((sk, pk))
 }
 
+/// Creates two instances of `CryptoServer`, generating key pairs for each.
 fn make_server_pair(protocol_version: ProtocolVersion) -> Result<(CryptoServer, CryptoServer)> {
     let psk = SymKey::random();
     let ((ska, pka), (skb, pkb)) = (keygen()?, keygen()?);
@@ -73,6 +83,8 @@ fn make_server_pair(protocol_version: ProtocolVersion) -> Result<(CryptoServer,
 }
 
 fn main() {
+    let trace = rosenpass_util::trace_bench::trace();
+
     // Attempt to use memfd_secrets for storing sensitive key material
     secret_policy_try_use_memfd_secrets();
 
@@ -83,7 +95,7 @@ fn main() {
     }
 
     // Emit a marker event to separate V02 and V03 trace sections
-    TRACE.emit_on_the_fly("start-hs-v03");
+    trace.emit_on_the_fly("start-hs-v03");
 
     // Run protocol for V03
     let (mut a_v03, mut b_v03) = make_server_pair(ProtocolVersion::V03).unwrap();
@@ -92,7 +104,7 @@ fn main() {
     }
 
     // Collect the trace events generated during the handshakes
-    let trace: Vec<_> = TRACE.clone().report();
+    let trace: Vec<_> = trace.clone().report();
 
     // Split the trace into V02 and V03 sections based on the marker
     let (trace_v02, trace_v03) = {
diff --git a/rosenpass/src/protocol/protocol.rs b/rosenpass/src/protocol/protocol.rs
index dd95a5c..13df003 100644
--- a/rosenpass/src/protocol/protocol.rs
+++ b/rosenpass/src/protocol/protocol.rs
@@ -17,6 +17,9 @@ use std::{
 
 use anyhow::{bail, ensure, Context, Result};
 
+#[cfg(feature = "trace_bench")]
+use rosenpass_util::trace_bench::Trace as _;
+
 use crate::{hash_domains, msgs::*, RosenpassError};
 use memoffset::span_of;
 use rosenpass_cipher_traits::primitives::{
@@ -3551,9 +3554,9 @@ impl CryptoServer {
 /// trace, which allows reconstructing the run times of the individual sections for performance
 /// measurement.
 macro_rules! protocol_section {
-    ($label:expr, $body:tt) => {{
+    ($label:expr, $body:block) => {{
         #[cfg(feature = "trace_bench")]
-        let _span_raii_handle = rosenpass_util::trace_bench::TRACE.emit_span($label);
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span($label);
 
         #[allow(unused_braces)]
         $body
@@ -3563,14 +3566,10 @@ macro_rules! protocol_section {
 impl CryptoServer {
     /// Core cryptographic protocol implementation: Kicks of the handshake
     /// on the initiator side, producing the InitHello message.
-    #[cfg_attr(
-        feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span(
-            "handle_initiation",
-            rosenpass_util::trace_bench::TRACE
-        )
-    )]
     pub fn handle_initiation(&mut self, peer: PeerPtr, ih: &mut InitHello) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_initiation");
+
         let mut hs = InitiatorHandshake::zero_with_timestamp(
             self,
             peer.get(self).protocol_version.keyed_hash(),
@@ -3633,19 +3632,15 @@ impl CryptoServer {
 
     /// Core cryptographic protocol implementation: Parses an [InitHello] message and produces a
     /// [RespHello] message on the responder side.
-    #[cfg_attr(
-        feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span(
-            "handle_init_hello",
-            rosenpass_util::trace_bench::TRACE
-        )
-    )]
     pub fn handle_init_hello(
         &mut self,
         ih: &InitHello,
         rh: &mut RespHello,
         keyed_hash: KeyedHash,
     ) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_init_hello");
+
         let mut core = HandshakeState::zero(keyed_hash);
 
         core.sidi = SessionId::from_slice(&ih.sidi);
@@ -3721,14 +3716,10 @@ impl CryptoServer {
 
     /// Core cryptographic protocol implementation: Parses an [RespHello] message and produces an
     /// [InitConf] message on the initiator side.
-    #[cfg_attr(
-        feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span(
-            "handle_resp_hello",
-            rosenpass_util::trace_bench::TRACE
-        )
-    )]
     pub fn handle_resp_hello(&mut self, rh: &RespHello, ic: &mut InitConf) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_resp_hello");
+
         // RHI2
         let peer = self
             .lookup_handshake(SessionId::from_slice(&rh.sidi))
@@ -3844,19 +3835,15 @@ impl CryptoServer {
     ///
     /// This concludes the handshake on the cryptographic level; the [EmptyData] message is just
     /// an acknowledgement message telling the initiator to stop performing retransmissions.
-    #[cfg_attr(
-        feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span(
-            "handle_init_conf",
-            rosenpass_util::trace_bench::TRACE
-        )
-    )]
     pub fn handle_init_conf(
         &mut self,
         ic: &InitConf,
         rc: &mut EmptyData,
         keyed_hash: KeyedHash,
     ) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_init_conf");
+
         // (peer, bn) ← LoadBiscuit(InitConf.biscuit)
         // ICR1
         let (peer, biscuit_no, mut core) = protocol_section!("ICR1", {
@@ -3956,18 +3943,14 @@ impl CryptoServer {
     /// message then terminates the handshake.
     ///
     /// The EmptyData message is just there to tell the initiator to abort retransmissions.
-    #[cfg_attr(
-        feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span(
-            "handle_resp_conf",
-            rosenpass_util::trace_bench::TRACE
-        )
-    )]
     pub fn handle_resp_conf(
         &mut self,
         msg_in: &Ref<&[u8], Envelope<EmptyData>>,
         seal_broken: String,
     ) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_resp_conf");
+
         let rc: &EmptyData = &msg_in.payload;
         let sid = SessionId::from_slice(&rc.sid);
         let hs = self
@@ -4020,14 +4003,10 @@ impl CryptoServer {
     /// DOS mitigation features.
     ///
     /// See more on DOS mitigation in Rosenpass in the [whitepaper](https://rosenpass.eu/whitepaper.pdf).
-    #[cfg_attr(
-        feature = "trace_bench",
-        rosenpass_util::trace_bench::trace_span(
-            "handle_cookie_reply",
-            rosenpass_util::trace_bench::TRACE
-        )
-    )]
     pub fn handle_cookie_reply(&mut self, cr: &CookieReply) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_cookie_reply");
+
         let peer_ptr: Option<PeerPtr> = self
             .lookup_session(Public::new(cr.inner.sid))
             .map(|v| PeerPtr(v.0))
diff --git a/util/Cargo.toml b/util/Cargo.toml
index 99d5baf..bdd1ddf 100644
--- a/util/Cargo.toml
+++ b/util/Cargo.toml
@@ -25,8 +25,7 @@ mio = { workspace = true }
 tempfile = { workspace = true }
 uds = { workspace = true, optional = true, features = ["mio_1xx"] }
 libcrux-test-utils = { workspace = true, optional = true }
-lazy_static = { workspace = true, optional = true }
 
 [features]
 experiment_file_descriptor_passing = ["uds"]
-trace_bench = ["dep:libcrux-test-utils", "dep:lazy_static"]
+trace_bench = ["dep:libcrux-test-utils"]
diff --git a/util/src/trace_bench.rs b/util/src/trace_bench.rs
index 5f1528a..367efa6 100644
--- a/util/src/trace_bench.rs
+++ b/util/src/trace_bench.rs
@@ -1,11 +1,10 @@
+use std::sync::OnceLock;
 use std::time::Instant;
 
 use libcrux_test_utils::tracing;
 
-lazy_static::lazy_static! {
-    /// The trace value used in all Rosepass crates.
-    pub static ref TRACE: RpTrace = RpTrace::default();
-}
+/// The trace value used in all Rosenpass crates.
+static TRACE: OnceLock<RpTrace> = OnceLock::new();
 
 /// The trace type used to trace Rosenpass for performance measurement.
 pub type RpTrace = tracing::MutexTrace<&'static str, Instant>;
@@ -17,3 +16,8 @@ pub type RpEventType = tracing::TraceEvent<&'static str, Instant>;
 // [`libcrux_test_utils`].
 pub use libcrux_test_utils::tracing::trace_span;
 pub use tracing::Trace;
+
+/// Returns a reference to the global trace, initializing it on first use.
+pub fn trace() -> &'static tracing::MutexTrace<&'static str, Instant> {
+    TRACE.get_or_init(tracing::MutexTrace::default)
+}

From 811c1746c1195046f3337c7a6dab2b31b123ecb0 Mon Sep 17 00:00:00 2001
From: "Jan Winkelmann (keks)" <jan@cryspen.com>
Date: Mon, 23 Jun 2025 15:36:11 +0200
Subject: [PATCH 10/10] format Cargo.toml

---
 ciphers/Cargo.toml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ciphers/Cargo.toml b/ciphers/Cargo.toml
index c20e338..7931f47 100644
--- a/ciphers/Cargo.toml
+++ b/ciphers/Cargo.toml
@@ -21,21 +21,21 @@ experiment_libcrux_blake2 = ["experiment_libcrux_define_blake2"]
 experiment_libcrux_kyber = ["experiment_libcrux_define_kyber"]
 experiment_libcrux_chachapoly = ["experiment_libcrux_define_chachapoly"]
 experiment_libcrux_chachapoly_test = [
-    "experiment_libcrux_define_chachapoly",
-    "dep:libcrux",
+  "experiment_libcrux_define_chachapoly",
+  "dep:libcrux",
 ]
 
 # shorthands
 experiment_libcrux_define_all = [
-    "experiment_libcrux_define_blake2",
-    "experiment_libcrux_define_chachapoly",
-    "experiment_libcrux_define_kyber",
+  "experiment_libcrux_define_blake2",
+  "experiment_libcrux_define_chachapoly",
+  "experiment_libcrux_define_kyber",
 ]
 experiment_libcrux_all = [
-    "experiment_libcrux_blake2",
-    "experiment_libcrux_chachapoly",
-    "experiment_libcrux_chachapoly_test",
-    "experiment_libcrux_kyber",
+  "experiment_libcrux_blake2",
+  "experiment_libcrux_chachapoly",
+  "experiment_libcrux_chachapoly_test",
+  "experiment_libcrux_kyber",
 ]
 
 bench = ["experiment_libcrux_define_all"]