diff --git a/.github/workflows/bench-primitives.yml b/.github/workflows/bench-primitives.yml
new file mode 100644
index 0000000..dc11cdd
--- /dev/null
+++ b/.github/workflows/bench-primitives.yml
@@ -0,0 +1,103 @@
+name: rosenpass-ciphers - primitives - benchmark
+
+permissions:
+  contents: write
+
+on:
+  pull_request:
+  push:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  prim-benchmark:
+    strategy:
+      fail-fast: true
+      matrix:
+        system: ["x86_64-linux", "i686-linux"]
+
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Install nix
+
+      - name: Install Nix
+        uses: cachix/install-nix-action@v27 # A popular action for installing Nix
+        with:
+          extra_nix_config: |
+            experimental-features = nix-command flakes
+            access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
+
+      # Set up environment
+
+      - name: 🛠️ Prepare Benchmark Path
+        env: # context values are passed through the environment, never spliced into the script
+          EVENT_NAME:  ${{ github.event_name }}
+          BRANCH_NAME: ${{ github.ref_name   }}
+          PR_NUMBER:   ${{ github.event.pull_request.number }}
+        run: |
+          case "$EVENT_NAME" in
+          "push")
+            echo "BENCH_PATH=branch/$BRANCH_NAME" >> "$GITHUB_ENV"
+            ;;
+          "pull_request")
+            echo "BENCH_PATH=pull/$PR_NUMBER" >> "$GITHUB_ENV"
+            ;;
+          *)
+            echo "don't know benchmark path for event of type $EVENT_NAME, aborting" >&2
+            exit 1 # unsupported trigger: fail the job instead of guessing a path
+          esac
+
+      # Benchmarks ...
+
+      - name: 🏃🏻‍♀️ Benchmarks (using Nix as shell)
+        working-directory: ciphers
+        run: nix develop ".#devShells.${{ matrix.system }}.benchmarks" --command cargo bench -F bench --bench primitives --verbose -- --output-format bencher | tee ../bench-primitives.txt
+
+      - name: Extract benchmarks
+        uses: cryspen/benchmark-data-extract-transform@v2
+        with:
+          name: rosenpass-ciphers primitives benchmarks
+          tool: "cargo"
+          os: ${{ matrix.system }}
+          output-file-path: bench-primitives.txt
+          data-out-path: bench-primitives-os.json
+
+      - name: Fix up 'os' label in benchmark data
+        run: jq 'map(with_entries(.key |= if . == "os" then "operating system" else . end))' <bench-primitives-os.json >bench-primitives.json
+
+      - name: Upload benchmarks
+        uses: cryspen/benchmark-upload-and-plot-action@v3
+        with:
+          name: Crypto Primitives Benchmarks
+          group-by: "operating system,primitive,algorithm"
+          schema: "operating system,primitive,algorithm,implementation,operation,length"
+          input-data-path: bench-primitives.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          # NOTE: pushes to current repository
+          gh-repository: github.com/${{ github.repository }}
+          auto-push: true
+          fail-on-alert: true
+          base-path: benchmarks/
+
+  ciphers-primitives-bench-status:
+    if: ${{ always() }} # run even when the benchmark job did not succeed, so the gate can report it
+    needs: [prim-benchmark]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Successful
+        if: ${{ !(contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')) }}
+        run: exit 0
+      - name: Failing # a cancelled benchmark produced no measurement, so it must not pass the gate
+        if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
+        run: exit 1
diff --git a/.github/workflows/bench-protocol.yml b/.github/workflows/bench-protocol.yml
new file mode 100644
index 0000000..139e6f6
--- /dev/null
+++ b/.github/workflows/bench-protocol.yml
@@ -0,0 +1,90 @@
+name: rosenpass - protocol - benchmark
+
+permissions:
+  contents: write
+
+on:
+  pull_request:
+  push:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  proto-benchmark:
+    strategy:
+      fail-fast: true
+      matrix:
+        system: ["x86_64-linux", "i686-linux"]
+
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Install nix
+
+      - name: Install Nix
+        uses: cachix/install-nix-action@v27 # A popular action for installing Nix
+        with:
+          extra_nix_config: |
+            experimental-features = nix-command flakes
+            access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
+
+      # Set up environment
+
+      - name: 🛠️ Prepare Benchmark Path
+        env: # context values are passed through the environment, never spliced into the script
+          EVENT_NAME:  ${{ github.event_name }}
+          BRANCH_NAME: ${{ github.ref_name   }}
+          PR_NUMBER:   ${{ github.event.pull_request.number }}
+        run: |
+          case "$EVENT_NAME" in
+          "push")
+            echo "BENCH_PATH=branch/$BRANCH_NAME" >> "$GITHUB_ENV"
+            ;;
+          "pull_request")
+            echo "BENCH_PATH=pull/$PR_NUMBER" >> "$GITHUB_ENV"
+            ;;
+          *)
+            echo "don't know benchmark path for event of type $EVENT_NAME, aborting" >&2
+            exit 1 # unsupported trigger: fail the job instead of guessing a path
+          esac
+
+      # Benchmarks ...
+
+      - name: 🏃🏻‍♀️ Benchmarks
+        run: nix develop ".#devShells.${{ matrix.system }}.benchmarks" --command cargo bench -p rosenpass --bench trace_handshake -F trace_bench --verbose >bench-protocol.json
+
+      - name: Upload benchmarks
+        uses: cryspen/benchmark-upload-and-plot-action@v3
+        with:
+          name: Protocol Benchmarks
+          group-by: "operating system,architecture,protocol version,run time"
+          schema: "operating system,architecture,protocol version,run time,name"
+          input-data-path: bench-protocol.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          # NOTE: pushes to current repository
+          gh-repository: github.com/${{ github.repository }}
+          auto-push: true
+          fail-on-alert: true
+          base-path: benchmarks/
+
+  ciphers-protocol-bench-status:
+    if: ${{ always() }} # run even when the benchmark job did not succeed, so the gate can report it
+    needs: [proto-benchmark]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Successful
+        if: ${{ !(contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')) }}
+        run: exit 0
+      - name: Failing # a cancelled benchmark produced no measurement, so it must not pass the gate
+        if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
+        run: exit 1
diff --git a/Cargo.lock b/Cargo.lock
index 98e590d..cbe6f62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1269,7 +1269,7 @@ version = "0.0.3-pre"
 source = "git+https://github.com/cryspen/libcrux.git?rev=10ce653e9476#10ce653e94761352b657b6cecdcc0c85675813df"
 dependencies = [
  "libcrux-hacl-rs",
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
 ]
 
 [[package]]
@@ -1279,7 +1279,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78d522fb626847390ea4b776c7eca179ecec363c6c4730b61b0c0feb797b8d92"
 dependencies = [
  "libcrux-hacl-rs",
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
  "libcrux-poly1305",
 ]
 
@@ -1299,7 +1299,7 @@ version = "0.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8bba0885296a72555a5d77056c39cc9b04edd9ab1afa3025ef3dbd96220705c"
 dependencies = [
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
 ]
 
 [[package]]
@@ -1321,6 +1321,15 @@ dependencies = [
  "syn 2.0.98",
 ]
 
+[[package]]
+name = "libcrux-macros"
+version = "0.0.3"
+source = "git+https://github.com/cryspen/libcrux.git?rev=0ab6d2dd9c1f#0ab6d2dd9c1f39c82b1125a566d6befb38feea28"
+dependencies = [
+ "quote",
+ "syn 2.0.98",
+]
+
 [[package]]
 name = "libcrux-ml-kem"
 version = "0.0.2-beta.3"
@@ -1350,7 +1359,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "80143d78ae14ab51ceb2c8a9514fb60af6645d42a9c951bc511792c19c974fca"
 dependencies = [
  "libcrux-hacl-rs",
- "libcrux-macros",
+ "libcrux-macros 0.0.2",
 ]
 
 [[package]]
@@ -1364,6 +1373,14 @@ dependencies = [
  "libcrux-platform",
 ]
 
+[[package]]
+name = "libcrux-test-utils"
+version = "0.0.2"
+source = "git+https://github.com/cryspen/libcrux.git?rev=0ab6d2dd9c1f#0ab6d2dd9c1f39c82b1125a566d6befb38feea28"
+dependencies = [
+ "libcrux-macros 0.0.3",
+]
+
 [[package]]
 name = "libfuzzer-sys"
 version = "0.4.9"
@@ -2024,6 +2041,7 @@ dependencies = [
  "hex",
  "hex-literal",
  "home",
+ "libcrux-test-utils",
  "log",
  "memoffset 0.9.1",
  "mio",
@@ -2070,6 +2088,7 @@ dependencies = [
  "anyhow",
  "blake2",
  "chacha20poly1305",
+ "criterion",
  "libcrux",
  "libcrux-blake2",
  "libcrux-chacha20poly1305",
@@ -2153,6 +2172,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "base64ct",
+ "libcrux-test-utils",
  "mio",
  "rustix",
  "static_assertions",
diff --git a/Cargo.toml b/Cargo.toml
index 8583095..368b891 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -73,12 +73,14 @@ libcrux = { version = "0.0.2-pre.2" }
 libcrux-chacha20poly1305 = { version = "0.0.2-beta.3" }
 libcrux-ml-kem = { version = "0.0.2-beta.3" }
 libcrux-blake2 = { git = "https://github.com/cryspen/libcrux.git", rev = "10ce653e9476" }
+libcrux-test-utils = { git = "https://github.com/cryspen/libcrux.git", rev = "0ab6d2dd9c1f" }
 hex-literal = { version = "0.4.1" }
 hex = { version = "0.4.3" }
 heck = { version = "0.5.0" }
 libc = { version = "0.2" }
 uds = { git = "https://github.com/rosenpass/uds" }
 signal-hook = "0.3.17"
+lazy_static = "1.5"
 
 #Dev dependencies
 serial_test = "3.2.0"
diff --git a/ciphers/Cargo.toml b/ciphers/Cargo.toml
index 2e8e5c7..7931f47 100644
--- a/ciphers/Cargo.toml
+++ b/ciphers/Cargo.toml
@@ -11,19 +11,39 @@ readme = "readme.md"
 rust-version = "1.77.0"
 
 [features]
+# compile the libcrux-backed types into the module tree (without using them by default)
+experiment_libcrux_define_blake2 = ["dep:libcrux-blake2", "dep:thiserror"]
+experiment_libcrux_define_kyber = ["dep:libcrux-ml-kem", "dep:rand"]
+experiment_libcrux_define_chachapoly = ["dep:libcrux-chacha20poly1305"]
+
+# use the libcrux-backed types by default (each implies its matching `define` feature)
+experiment_libcrux_blake2 = ["experiment_libcrux_define_blake2"]
+experiment_libcrux_kyber = ["experiment_libcrux_define_kyber"]
+experiment_libcrux_chachapoly = ["experiment_libcrux_define_chachapoly"]
+experiment_libcrux_chachapoly_test = [
+  "experiment_libcrux_define_chachapoly",
+  "dep:libcrux",
+]
+
+# shorthands
+experiment_libcrux_define_all = [
+  "experiment_libcrux_define_blake2",
+  "experiment_libcrux_define_chachapoly",
+  "experiment_libcrux_define_kyber",
+]
 experiment_libcrux_all = [
   "experiment_libcrux_blake2",
   "experiment_libcrux_chachapoly",
   "experiment_libcrux_chachapoly_test",
   "experiment_libcrux_kyber",
 ]
-experiment_libcrux_blake2 = ["dep:libcrux-blake2", "dep:thiserror"]
-experiment_libcrux_chachapoly = ["dep:libcrux-chacha20poly1305"]
-experiment_libcrux_chachapoly_test = [
-  "experiment_libcrux_chachapoly",
-  "dep:libcrux",
-]
-experiment_libcrux_kyber = ["dep:libcrux-ml-kem", "dep:rand"]
+
+bench = ["experiment_libcrux_define_all"]
+
+[[bench]]
+name = "primitives"
+harness = false
+required-features = ["bench"]
 
 [dependencies]
 anyhow = { workspace = true }
@@ -50,3 +70,4 @@ libcrux = { workspace = true, optional = true }
 
 [dev-dependencies]
 rand = { workspace = true }
+criterion = { workspace = true }
diff --git a/ciphers/benches/primitives.rs b/ciphers/benches/primitives.rs
new file mode 100644
index 0000000..f1ef4d7
--- /dev/null
+++ b/ciphers/benches/primitives.rs
@@ -0,0 +1,378 @@
+criterion::criterion_main!(keyed_hash::benches, aead::benches, kem::benches);
+
+fn benchid(base: KvPairs, last: KvPairs) -> String {
+    format!("{},{}", base, last) // id = shared pairs, a comma, then the varying pairs
+}
+
+/// A single `key=value` label that becomes part of a benchmark id.
+#[derive(Clone, Copy, Debug)]
+struct KvPair<'a>(&'a str, &'a str);
+
+impl std::fmt::Display for KvPair<'_> {
+    /// Writes the pair as `key=value`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{k}={v}", k = self.0, v = self.1)
+    }
+}
+
+/// A borrowed list of [`KvPair`]s that displays as a comma-separated list.
+#[derive(Clone, Copy, Debug)]
+struct KvPairs<'a>(&'a [KvPair<'a>]);
+
+impl std::fmt::Display for KvPairs<'_> {
+    /// Writes the pairs joined by `,`; an empty list writes nothing.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // The delimiter is empty before the first pair, so lists of zero, one
+        // and many pairs all go through the same loop without special cases.
+        let mut delim = "";
+        for pair in self.0 {
+            write!(f, "{delim}{pair}")?;
+            delim = ",";
+        }
+        Ok(())
+    }
+}
+
+mod kem {
+    use criterion::Criterion;
+    use rosenpass_cipher_traits::primitives::Kem;
+
+    criterion::criterion_group!(
+        benches,
+        bench_kyber512_libcrux,
+        bench_kyber512_oqs,
+        bench_classicmceliece460896_oqs
+    );
+
+    fn bench_kyber512_libcrux(c: &mut Criterion) {
+        template(
+            c,
+            "kyber512",
+            "libcrux",
+            rosenpass_ciphers::subtle::libcrux::kyber512::Kyber512,
+        );
+    }
+
+    fn bench_kyber512_oqs(c: &mut Criterion) {
+        template(c, "kyber512", "oqs", rosenpass_oqs::Kyber512);
+    }
+
+    fn bench_classicmceliece460896_oqs(c: &mut Criterion) {
+        template(
+            c,
+            "classicmceliece460896",
+            "oqs",
+            rosenpass_oqs::ClassicMceliece460896,
+        );
+    }
+
+    /// Registers the keygen, encaps and decaps benchmarks for one KEM implementation.
+    fn template<
+        const SK_LEN: usize,
+        const PK_LEN: usize,
+        const CT_LEN: usize,
+        const SHK_LEN: usize,
+        T: Kem<SK_LEN, PK_LEN, CT_LEN, SHK_LEN>,
+    >(
+        c: &mut Criterion,
+        alg_name: &str,
+        impl_name: &str,
+        scheme: T,
+    ) {
+        use super::{benchid, KvPair, KvPairs};
+
+        let base = [
+            KvPair("primitive", "kem"),
+            KvPair("algorithm", alg_name),
+            KvPair("implementation", impl_name),
+            KvPair("length", "-1"),
+        ];
+        // Only the operation label differs between the benchmarks below.
+        let id_for = |op| benchid(KvPairs(&base), KvPairs(&[KvPair("operation", op)]));
+
+        c.bench_function(&id_for("keygen"), |b| {
+            let mut secret_key = [0; SK_LEN];
+            let mut public_key = [0; PK_LEN];
+
+            b.iter(|| {
+                scheme.keygen(&mut secret_key, &mut public_key).unwrap();
+            });
+        });
+
+        c.bench_function(&id_for("encaps"), |b| {
+            let mut secret_key = [0; SK_LEN];
+            let mut public_key = [0; PK_LEN];
+            let mut ciphertext = [0; CT_LEN];
+            let mut shared_key = [0; SHK_LEN];
+            // Key generation is setup; it runs once, outside the closure given to `iter`.
+            scheme.keygen(&mut secret_key, &mut public_key).unwrap();
+
+            b.iter(|| {
+                scheme.encaps(&mut shared_key, &mut ciphertext, &public_key).unwrap();
+            });
+        });
+
+        c.bench_function(&id_for("decaps"), |b| {
+            let mut secret_key = [0; SK_LEN];
+            let mut public_key = [0; PK_LEN];
+            let mut ciphertext = [0; CT_LEN];
+            let mut shared_key = [0; SHK_LEN];
+            let mut decapsulated = [0; SHK_LEN];
+            // Create a key pair and a ciphertext once; only `decaps` is inside `iter`.
+            scheme.keygen(&mut secret_key, &mut public_key).unwrap();
+            scheme.encaps(&mut shared_key, &mut ciphertext, &public_key).unwrap();
+
+            b.iter(|| {
+                scheme.decaps(&mut decapsulated, &secret_key, &ciphertext).unwrap();
+            });
+        });
+    }
+}
+mod aead {
+    use criterion::Criterion;
+    use rosenpass_cipher_traits::primitives::Aead;
+
+    criterion::criterion_group!(
+        benches,
+        bench_chachapoly_libcrux,
+        bench_chachapoly_rustcrypto,
+        bench_xchachapoly_rustcrypto,
+    );
+
+    const KEY_LEN: usize = rosenpass_ciphers::Aead::KEY_LEN;
+    const TAG_LEN: usize = rosenpass_ciphers::Aead::TAG_LEN;
+
+    fn bench_chachapoly_libcrux(c: &mut Criterion) {
+        template(
+            c,
+            "chacha20poly1305",
+            "libcrux",
+            rosenpass_ciphers::subtle::libcrux::chacha20poly1305_ietf::ChaCha20Poly1305,
+        );
+    }
+
+    fn bench_chachapoly_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "chacha20poly1305",
+            "rustcrypto",
+            rosenpass_ciphers::subtle::rust_crypto::chacha20poly1305_ietf::ChaCha20Poly1305,
+        );
+    }
+
+    fn bench_xchachapoly_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "xchacha20poly1305",
+            "rustcrypto",
+            rosenpass_ciphers::subtle::rust_crypto::xchacha20poly1305_ietf::XChaCha20Poly1305,
+        );
+    }
+
+    /// Registers encrypt and decrypt benchmarks (0, 32 and 1024 byte plaintexts) for one AEAD.
+    fn template<const NONCE_LEN: usize, T: Aead<KEY_LEN, NONCE_LEN, TAG_LEN>>(
+        c: &mut Criterion,
+        alg_name: &str,
+        impl_name: &str,
+        scheme: T,
+    ) {
+        use crate::{benchid, KvPair, KvPairs};
+
+        let base = [
+            KvPair("primitive", "aead"),
+            KvPair("algorithm", alg_name),
+            KvPair("implementation", impl_name),
+        ];
+        let id_for = |op, len| {
+            benchid(
+                KvPairs(&base),
+                KvPairs(&[KvPair("operation", op), KvPair("length", len)]),
+            )
+        };
+        // Fixed inputs shared by every run; the associated data is always empty.
+        let key = [12; KEY_LEN];
+        let nonce = [23; NONCE_LEN];
+        let ad = [];
+
+        c.bench_function(&id_for("encrypt", "0byte"), |b| {
+            const DATA_LEN: usize = 0;
+
+            let plaintext = [];
+            let mut ciphertext = [0; DATA_LEN + TAG_LEN];
+
+            b.iter(|| {
+                scheme.encrypt(&mut ciphertext, &key, &nonce, &ad, &plaintext).unwrap();
+            });
+        });
+
+        c.bench_function(&id_for("decrypt", "0byte"), |b| {
+            const DATA_LEN: usize = 0;
+
+            let plaintext = [];
+            let mut ciphertext = [0; DATA_LEN + TAG_LEN];
+            let mut decrypted = [0u8; DATA_LEN];
+            // Encrypt once up front; only the decrypt call sits inside `iter`.
+            scheme.encrypt(&mut ciphertext, &key, &nonce, &ad, &plaintext).unwrap();
+
+            b.iter(|| {
+                scheme
+                    .decrypt(&mut decrypted, &key, &nonce, &ad, &mut ciphertext)
+                    .unwrap()
+            })
+        });
+
+        c.bench_function(&id_for("encrypt", "32byte"), |b| {
+            const DATA_LEN: usize = 32;
+
+            let plaintext = [34u8; DATA_LEN];
+            let mut ciphertext = [0; DATA_LEN + TAG_LEN];
+
+            b.iter(|| {
+                scheme.encrypt(&mut ciphertext, &key, &nonce, &ad, &plaintext).unwrap();
+            });
+        });
+
+        c.bench_function(&id_for("decrypt", "32byte"), |b| {
+            const DATA_LEN: usize = 32;
+
+            let plaintext = [34u8; DATA_LEN];
+            let mut ciphertext = [0; DATA_LEN + TAG_LEN];
+            let mut decrypted = [0u8; DATA_LEN];
+            // Encrypt once up front; only the decrypt call sits inside `iter`.
+            scheme.encrypt(&mut ciphertext, &key, &nonce, &ad, &plaintext).unwrap();
+
+            b.iter(|| {
+                scheme
+                    .decrypt(&mut decrypted, &key, &nonce, &ad, &mut ciphertext)
+                    .unwrap()
+            })
+        });
+
+        c.bench_function(&id_for("encrypt", "1024byte"), |b| {
+            const DATA_LEN: usize = 1024;
+
+            let plaintext = [34u8; DATA_LEN];
+            let mut ciphertext = [0; DATA_LEN + TAG_LEN];
+
+            b.iter(|| {
+                scheme.encrypt(&mut ciphertext, &key, &nonce, &ad, &plaintext).unwrap();
+            });
+        });
+        c.bench_function(&id_for("decrypt", "1024byte"), |b| {
+            const DATA_LEN: usize = 1024;
+
+            let plaintext = [34u8; DATA_LEN];
+            let mut ciphertext = [0; DATA_LEN + TAG_LEN];
+            let mut decrypted = [0u8; DATA_LEN];
+            // Encrypt once up front; only the decrypt call sits inside `iter`.
+            scheme.encrypt(&mut ciphertext, &key, &nonce, &ad, &plaintext).unwrap();
+
+            b.iter(|| {
+                scheme
+                    .decrypt(&mut decrypted, &key, &nonce, &ad, &mut ciphertext)
+                    .unwrap()
+            })
+        });
+    }
+}
+
+mod keyed_hash {
+    use criterion::Criterion;
+    use rosenpass_cipher_traits::primitives::KeyedHash;
+
+    criterion::criterion_group!(
+        benches,
+        bench_blake2b_rustcrypto,
+        bench_blake2b_libcrux,
+        bench_shake256_rustcrypto,
+    );
+
+    const KEY_LEN: usize = 32;
+    const HASH_LEN: usize = 32;
+
+    fn bench_blake2b_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "blake2b",
+            "rustcrypto",
+            &rosenpass_ciphers::subtle::rust_crypto::blake2b::Blake2b,
+        );
+    }
+
+    fn bench_blake2b_libcrux(c: &mut Criterion) {
+        template(
+            c,
+            "blake2b",
+            "libcrux",
+            &rosenpass_ciphers::subtle::libcrux::blake2b::Blake2b,
+        );
+    }
+
+    fn bench_shake256_rustcrypto(c: &mut Criterion) {
+        template(
+            c,
+            "shake256",
+            "rustcrypto",
+            &rosenpass_ciphers::subtle::rust_crypto::keyed_shake256::SHAKE256Core,
+        );
+    }
+
+    /// Registers keyed-hash benchmarks over 0 to 1024 byte inputs; `_` only selects `H`.
+    fn template<H: KeyedHash<KEY_LEN, HASH_LEN>>(
+        c: &mut Criterion,
+        alg_name: &str,
+        impl_name: &str,
+        _: &H,
+    ) where
+        H::Error: std::fmt::Debug,
+    {
+        use crate::{benchid, KvPair, KvPairs};
+
+        let key = [12u8; KEY_LEN];
+        let mut digest = [0u8; HASH_LEN];
+
+        let base = [
+            KvPair("primitive", "keyedhash"),
+            KvPair("algorithm", alg_name),
+            KvPair("implementation", impl_name),
+            KvPair("operation", "hash"),
+        ];
+        let id_for = |len| benchid(KvPairs(&base), KvPairs(&[KvPair("length", len)]));
+
+        c.bench_function(&id_for("0byte"), |b| {
+            let input = [];
+
+            b.iter(|| {
+                H::keyed_hash(&key, &input, &mut digest).unwrap();
+            })
+        });
+        c.bench_function(&id_for("32byte"), |b| {
+            let input = [34u8; 32];
+
+            b.iter(|| {
+                H::keyed_hash(&key, &input, &mut digest).unwrap();
+            })
+        });
+        c.bench_function(&id_for("64byte"), |b| {
+            let input = [34u8; 64];
+
+            b.iter(|| {
+                H::keyed_hash(&key, &input, &mut digest).unwrap();
+            })
+        });
+        c.bench_function(&id_for("128byte"), |b| {
+            let input = [34u8; 128];
+
+            b.iter(|| {
+                H::keyed_hash(&key, &input, &mut digest).unwrap();
+            })
+        });
+        c.bench_function(&id_for("1024byte"), |b| {
+            let input = [34u8; 1024];
+
+            b.iter(|| {
+                H::keyed_hash(&key, &input, &mut digest).unwrap();
+            })
+        });
+    }
+}
diff --git a/ciphers/src/subtle/libcrux/mod.rs b/ciphers/src/subtle/libcrux/mod.rs
index 432bd5f..f481e53 100644
--- a/ciphers/src/subtle/libcrux/mod.rs
+++ b/ciphers/src/subtle/libcrux/mod.rs
@@ -4,11 +4,11 @@
 //!
 //! [Github](https://github.com/cryspen/libcrux)
 
-#[cfg(feature = "experiment_libcrux_blake2")]
+#[cfg(feature = "experiment_libcrux_define_blake2")]
 pub mod blake2b;
 
-#[cfg(feature = "experiment_libcrux_chachapoly")]
+#[cfg(feature = "experiment_libcrux_define_chachapoly")]
 pub mod chacha20poly1305_ietf;
 
-#[cfg(feature = "experiment_libcrux_kyber")]
+#[cfg(feature = "experiment_libcrux_define_kyber")]
 pub mod kyber512;
diff --git a/ciphers/src/subtle/mod.rs b/ciphers/src/subtle/mod.rs
index b3c3aa8..6d3083b 100644
--- a/ciphers/src/subtle/mod.rs
+++ b/ciphers/src/subtle/mod.rs
@@ -9,8 +9,8 @@ pub mod custom;
 pub mod rust_crypto;
 
 #[cfg(any(
-    feature = "experiment_libcrux_blake2",
-    feature = "experiment_libcrux_chachapoly",
-    feature = "experiment_libcrux_kyber"
+    feature = "experiment_libcrux_define_blake2",
+    feature = "experiment_libcrux_define_chachapoly",
+    feature = "experiment_libcrux_define_kyber",
 ))]
 pub mod libcrux;
diff --git a/flake.nix b/flake.nix
index 68ae590..9e2a935 100644
--- a/flake.nix
+++ b/flake.nix
@@ -89,6 +89,7 @@
         [
           "x86_64-linux"
           "aarch64-linux"
+          "i686-linux"
         ]
         (
           system:
@@ -172,6 +173,14 @@
                 inherit (pkgs.cargo-llvm-cov) LLVM_COV LLVM_PROFDATA;
               };
             };
+            devShells.benchmarks = pkgs.mkShell {
+              inputsFrom = [ pkgs.rosenpass ];
+              nativeBuildInputs = with pkgs; [
+                cargo-release
+                clippy
+                rustfmt
+              ];
+            };
 
             checks =
               {
diff --git a/pkgs/rosenpass.nix b/pkgs/rosenpass.nix
index 30f560f..c25809f 100644
--- a/pkgs/rosenpass.nix
+++ b/pkgs/rosenpass.nix
@@ -79,6 +79,7 @@ rustPlatform.buildRustPackage {
       "memsec-0.6.3" = "sha256-4ri+IEqLd77cLcul3lZrmpDKj4cwuYJ8oPRAiQNGeLw=";
       "uds-0.4.2" = "sha256-qlxr/iJt2AV4WryePIvqm/8/MK/iqtzegztNliR93W8=";
       "libcrux-blake2-0.0.3-pre" = "sha256-0CLjuzwJqGooiODOHf5D8Hc8ClcG/XcGvVGyOVnLmJY=";
+      "libcrux-macros-0.0.3" = "sha256-Tb5uRirwhRhoFEK8uu1LvXl89h++40pxzZ+7kXe8RAI=";
     };
   };
 
diff --git a/readme.md b/readme.md
index aa4c881..6021361 100644
--- a/readme.md
+++ b/readme.md
@@ -14,7 +14,7 @@ This repository contains
 
 ## Getting started
 
-First, [install rosenpass](#Getting-Rosenpass). Then, check out the help functions of `rp` & `rosenpass`:
+First, [install rosenpass](#getting-rosenpass). Then, check out the help functions of `rp` & `rosenpass`:
 
 ```sh
 rp help
@@ -64,11 +64,7 @@ The analysis is implemented according to modern software engineering principles:
 The code uses a variety of optimizations to speed up analysis such as using secret functions to model trusted/malicious setup. We split the model into two separate entry points which can be analyzed in parallel. Each is much faster than both models combined.
 A wrapper script provides instant feedback about which queries execute as expected in color: A red cross if a query fails and a green check if it succeeds.
 
-[^liboqs]: https://openquantumsafe.org/liboqs/
-[^wg]: https://www.wireguard.com/
-[^pqwg]: https://eprint.iacr.org/2020/379
-[^pqwg-statedis]: Unless supplied with a pre-shared-key, but this defeats the purpose of a key exchange protocol
-[^wg-statedis]: https://lists.zx2c4.com/pipermail/wireguard/2021-August/006916.htmlA
+[^liboqs]: <https://openquantumsafe.org/liboqs/>
 
 # Getting Rosenpass
 
@@ -87,6 +83,47 @@ Rosenpass is also available as prebuilt Docker images:
 
 For details on how to use these images, refer to the [Docker usage guide](docker/USAGE.md).
 
+## Benchmarks
+
+This repository contains facilities for benchmarking both the Rosenpass
+protocol code and the implementations of the cryptographic primitives used
+by it. The primitives are benchmarked using criterion. For the protocol code
+benchmarks we use a library for instrumenting the code such that events are
+written to a trace, which is then inspected after a run.
+
+Benchmarks are automatically run on CI. The measurements are visualized in the
+[Benchmark Dashboard].
+
+[Benchmark Dashboard]: https://rosenpass.github.io/rosenpass/benchmarks
+
+### Primitive Benchmarks
+
+There are benchmarks for the functions of the traits `Kem`, `Aead` and
+`KeyedHash`. They are run for all implementations in the `primitives`
+benchmark of `rosenpass-ciphers`. Run the benchmarks and view their results using
+
+```sh
+cargo bench -p rosenpass-ciphers --bench primitives -F bench
+```
+
+Note that the `bench` feature compiles the libcrux-backed trait
+implementations into the module tree, but does not make them the
+default implementations.
+
+### Protocol Benchmarks
+
+The trace that events are written to lives in the `trace_bench` module
+of the `rosenpass-util` crate. A basic benchmark that performs some
+minor statistical analysis of the trace can be run using
+
+```sh
+cargo bench -p rosenpass --bench trace_handshake -F trace_bench
+```
+
+This runs the benchmarks and prints the results in machine-readable JSON.
+
+---
+
 # Mirrors
 
 Don't want to use GitHub or only have an IPv6 connection? Rosenpass has set up two mirrors for this:
diff --git a/rosenpass/Cargo.toml b/rosenpass/Cargo.toml
index 2975a1f..c23d1f3 100644
--- a/rosenpass/Cargo.toml
+++ b/rosenpass/Cargo.toml
@@ -35,6 +35,11 @@ required-features = [
   "internal_bin_gen_ipc_msg_types",
 ]
 
+[[bench]]
+name = "trace_handshake"
+harness = false
+required-features = ["trace_bench"]
+
 [[bench]]
 name = "handshake"
 harness = false
@@ -72,6 +77,7 @@ command-fds = { workspace = true, optional = true }
 rustix = { workspace = true, optional = true }
 uds = { workspace = true, optional = true, features = ["mio_1xx"] }
 signal-hook = { workspace = true, optional = true }
+libcrux-test-utils = { workspace = true, optional = true }
 
 [build-dependencies]
 anyhow = { workspace = true }
@@ -106,6 +112,7 @@ experiment_api = [
 internal_signal_handling_for_coverage_reports = ["signal-hook"]
 internal_testing = []
 internal_bin_gen_ipc_msg_types = ["hex", "heck"]
+trace_bench = ["rosenpass-util/trace_bench", "dep:libcrux-test-utils"]
 
 [lints.rust]
 unexpected_cfgs = { level = "allow", check-cfg = ['cfg(coverage)'] }
diff --git a/rosenpass/benches/trace_handshake.rs b/rosenpass/benches/trace_handshake.rs
new file mode 100644
index 0000000..95a7bdb
--- /dev/null
+++ b/rosenpass/benches/trace_handshake.rs
@@ -0,0 +1,386 @@
+use std::{
+    collections::HashMap,
+    hint::black_box,
+    io::{self, Write},
+    ops::DerefMut,
+    time::{Duration, Instant},
+};
+
+use anyhow::Result;
+use libcrux_test_utils::tracing::{EventType, Trace as _};
+
+use rosenpass_cipher_traits::primitives::Kem;
+use rosenpass_ciphers::StaticKem;
+use rosenpass_secret_memory::secret_policy_try_use_memfd_secrets;
+use rosenpass_util::trace_bench::RpEventType;
+
+use rosenpass::protocol::{
+    CryptoServer, HandleMsgResult, MsgBuf, PeerPtr, ProtocolVersion, SPk, SSk, SymKey,
+};
+
+const ITERATIONS: usize = 100;
+
+/// Delivers the first `msgl` bytes of `msgb` from `tx` to `rx` and, while `rx` keeps producing a
+/// response, recurses with the roles of the two parties swapped.
+///
+/// Returns the output keys as `(key of tx, key of rx)`; asserts that `rx` obtains its key at most once.
+fn handle(
+    tx: &mut CryptoServer,
+    msgb: &mut MsgBuf,
+    msgl: usize,
+    rx: &mut CryptoServer,
+    resb: &mut MsgBuf,
+) -> Result<(Option<SymKey>, Option<SymKey>)> {
+    let result = rx.handle_msg(&msgb[..msgl], &mut **resb)?;
+    let HandleMsgResult { exchanged_with, resp } = result;
+
+    // Each server in this benchmark knows exactly one peer, so an exchange must refer to peer 0
+    assert!(matches!(exchanged_with, None | Some(PeerPtr(0))));
+
+    let key_now = exchanged_with.map(|peer| rx.osk(peer).unwrap());
+
+    // In the recursive call the roles are swapped, so the returned tuple is (key of rx, key of tx)
+    let (key_later, key_tx) = match resp {
+        Some(resl) => handle(rx, resb, resl, tx, msgb)?,
+        None => (None, None),
+    };
+
+    assert!(key_later.is_none() || key_now.is_none());
+
+    Ok((key_tx, key_later.or(key_now)))
+}
+
+/// Runs one complete handshake between `ini` (initiator) and `res` (responder): creates the
+/// initial message for peer 0, passes it to `handle`, and panics if either party ends up without
+/// a key or if the two keys differ.
+fn hs(ini: &mut CryptoServer, res: &mut CryptoServer) -> Result<()> {
+    let mut ini_buf = MsgBuf::zero();
+    let mut res_buf = MsgBuf::zero();
+    let len = ini.initiate_handshake(PeerPtr(0), &mut *ini_buf)?;
+    let (key_ini, key_res) = handle(ini, &mut ini_buf, len, res, &mut res_buf)?;
+    assert!(key_ini.unwrap().secret() == key_res.unwrap().secret());
+    Ok(())
+}
+
+/// Generates a fresh static KEM key pair, returned as `(secret key, public key)`.
+fn keygen() -> Result<(SSk, SPk)> {
+    let (mut secret, mut public) = (SSk::zero(), SPk::zero());
+    StaticKem.keygen(secret.secret_mut(), public.deref_mut())?;
+    Ok((secret, public))
+}
+
+/// Creates two instances of `CryptoServer` with freshly generated key pairs and registers each
+/// as the other's only peer, using a shared random pre-shared key.
+fn make_server_pair(protocol_version: ProtocolVersion) -> Result<(CryptoServer, CryptoServer)> {
+    let psk = SymKey::random();
+    let (sk_a, pk_a) = keygen()?;
+    let (sk_b, pk_b) = keygen()?;
+    let mut srv_a = CryptoServer::new(sk_a, pk_a.clone());
+    let mut srv_b = CryptoServer::new(sk_b, pk_b.clone());
+    srv_a.add_peer(Some(psk.clone()), pk_b, protocol_version.clone())?;
+    srv_b.add_peer(Some(psk), pk_a, protocol_version)?;
+    Ok((srv_a, srv_b))
+}
+
+/// Benchmark entry point: runs `ITERATIONS` handshakes with protocol V02, then with V03, and
+/// prints aggregate statistics for both sections of the recorded trace as JSON on stdout.
+fn main() {
+    // Label of the on-the-fly event that separates the two sections of the trace
+    const V03_MARKER: &str = "start-hs-v03";
+
+    let tracer = rosenpass_util::trace_bench::trace();
+
+    // Attempt to use memfd_secrets for storing sensitive key material
+    secret_policy_try_use_memfd_secrets();
+
+    // Events recorded before the marker make up the V02 section
+    let (mut ini_v02, mut res_v02) = make_server_pair(ProtocolVersion::V02).unwrap();
+    for _ in 0..ITERATIONS {
+        hs(black_box(&mut ini_v02), black_box(&mut res_v02)).unwrap();
+    }
+
+    // Mark the end of the V02 section
+    tracer.emit_on_the_fly(V03_MARKER);
+
+    // Events recorded after the marker make up the V03 section
+    let (mut ini_v03, mut res_v03) = make_server_pair(ProtocolVersion::V03).unwrap();
+    for _ in 0..ITERATIONS {
+        hs(black_box(&mut ini_v03), black_box(&mut res_v03)).unwrap();
+    }
+
+    // Collect the trace events generated during the handshakes
+    let events: Vec<_> = tracer.clone().report();
+
+    // Locate the marker; the marker itself belongs to neither section
+    let marker_at = events
+        .iter()
+        .position(|event| event.label == V03_MARKER)
+        .unwrap();
+    let events_v02 = &events[..marker_at];
+    let events_v03 = &events[marker_at + 1..];
+
+    // Analyze each section separately and write all results as a single JSON array
+    let mut stdout = std::io::stdout();
+    let groups = vec![
+        ("V02", statistical_analysis(events_v02.to_vec())),
+        ("V03", statistical_analysis(events_v03.to_vec())),
+    ];
+    write_json_arrays(&mut stdout, groups).expect("error writing json data");
+}
+
+/// Computes aggregate statistics (mean, standard deviation) of the span durations for every
+/// label that occurs in `trace`. Labels without any completed span are left out.
+fn statistical_analysis(trace: Vec<RpEventType>) -> Vec<(&'static str, AggregateStat<Duration>)> {
+    let mut stats = Vec::new();
+    for (label, events) in bin_events(trace) {
+        let durations = extract_span_durations(label, &events);
+        // `analyze_durations` must not be called with an empty slice
+        if !durations.is_empty() {
+            stats.push((label, AggregateStat::analyze_durations(&durations)));
+        }
+    }
+    stats
+}
+
+/// Writes all statistics of all groups as one flat JSON array to `w`.
+///
+/// Every `(label, stat)` item of every group becomes one JSON object (see
+/// `AggregateStat::write_json_ns`) that is tagged with the protocol version of its group.
+/// Objects are separated by commas; an input without any items produces `[]`.
+///
+/// # Arguments
+/// * `w` - The writer to output JSON to (e.g., stdout, file).
+/// * `item_groups` - Yields tuples `(version, stats): (&'static str, II)`, where `version`
+///    names the protocol version and `stats` holds the statistics measured for that version.
+///
+/// # Type Parameters
+/// * `W` - A type that implements `std::io::Write`.
+/// * `II` - Yields `(label, agg_stat): (&'static str, AggregateStat<Duration>)`, where `label`
+///    is the label of the span, e.g. "IHI2".
+fn write_json_arrays<W: Write, II: IntoIterator<Item = (&'static str, AggregateStat<Duration>)>>(
+    w: &mut W,
+    item_groups: impl IntoIterator<Item = (&'static str, II)>,
+) -> io::Result<()> {
+    // Empty before the first object, a comma before every later one
+    let mut separator = "";
+
+    // Open the JSON array
+    write!(w, "[")?;
+
+    for (version, items) in item_groups {
+        for (label, agg_stat) in items {
+            // Each object carries the protocol version of the group it came from
+            write!(w, "{separator}")?;
+            agg_stat.write_json_ns(label, version, w)?;
+            separator = ",";
+        }
+    }
+
+    // Close the JSON array
+    write!(w, "]")
+}
+
+/// Coarse run time category of a span; used to group benchmark results in visualizations.
+enum RunTimeGroup {
+    /// Particularly long operations; written as "long".
+    Long,
+    /// Operations of moderate duration; written as "medium". Default for unlisted labels.
+    Medium,
+    /// Operations expected to complete under a millisecond; written as "below 1ms".
+    BelowMillisec,
+    /// Very fast operations, likely under a microsecond; written as "below 1us".
+    BelowMicrosec,
+}
+
+impl std::fmt::Display for RunTimeGroup {
+    /// Writes the human-readable name of the group, as it appears in the JSON output.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let name = match self {
+            Self::Long => "long",
+            Self::Medium => "medium",
+            Self::BelowMillisec => "below 1ms",
+            Self::BelowMicrosec => "below 1us",
+        };
+        f.write_str(name)
+    }
+}
+
+/// Maps the label of a traced span (a handler name or a protocol section identifier) to the
+/// `RunTimeGroup` it is reported under.
+fn run_time_group(label: &str) -> RunTimeGroup {
+    match label {
+        // Labels listed here are categorized by hand based on expected performance characteristics
+        "RHR1" | "IHI2" | "ICR6" => RunTimeGroup::BelowMicrosec,
+        "ICI3" | "ICI4" | "ICI7" | "ICR2" | "ICR3" | "ICR4" | "ICR7" | "IHI3" | "IHI4" | "IHI8"
+        | "IHR4" | "IHR6" | "IHR8" | "RHI3" | "RHI4" | "RHI6" | "RHI7" | "RHR3" | "RHR4"
+        | "RHR7" => RunTimeGroup::BelowMillisec,
+        "handle_init_hello" | "handle_resp_hello" | "RHI5" | "IHR5" => RunTimeGroup::Long,
+        // Every label that is not listed above falls into the medium group
+        _ => RunTimeGroup::Medium,
+    }
+}
+
+/// Bookkeeping entry used within `extract_span_durations`: an entry is created as `Start` when a
+/// span is opened and is replaced by `Duration` once the matching close event has been seen.
+#[derive(Debug, Clone)]
+enum StatEntry {
+    /// A span that was opened at the given instant and has not been closed (yet).
+    Start(Instant),
+    /// A completed span with the time that passed between its open and close events.
+    Duration(Duration),
+}
+
+/// Groups `events` by label: every event ends up in the vector stored under its own label, and
+/// events that share a label keep the order they had in the input.
+fn bin_events(events: Vec<RpEventType>) -> HashMap<&'static str, Vec<RpEventType>> {
+    events
+        .into_iter()
+        .fold(HashMap::new(), |mut bins: HashMap<_, Vec<_>>, event| {
+            // Creates the bin when the first event with this label shows up
+            let bin = bins.entry(event.label).or_default();
+            bin.push(event);
+            bins
+        })
+}
+
+/// Computes the duration of every completed span in `events`. All events are expected to carry
+/// the label `label`, which is only used in warning messages.
+///
+/// A `SpanClose` event is paired with the most recently opened span that is still open, so
+/// nested spans of the same label are resolved from the inside out. `OnTheFly` events are
+/// skipped. Unmatched open or close events are reported on stderr and yield no duration.
+/// The returned durations are ordered by the position of their `SpanOpen` event.
+fn extract_span_durations(label: &str, events: &[RpEventType]) -> Vec<Duration> {
+    // One entry per `SpanOpen` event, in order of appearance
+    let mut entries: Vec<StatEntry> = Vec::new();
+
+    for event in events {
+        match &event.ty {
+            EventType::SpanOpen => {
+                entries.push(StatEntry::Start(event.at));
+            }
+            EventType::SpanClose => {
+                // Search backwards for the innermost span that is still open
+                let innermost_open = entries
+                    .iter_mut()
+                    .rev()
+                    .find(|entry| matches!(entry, StatEntry::Start(_)));
+                match innermost_open {
+                    Some(slot) => {
+                        // Turn the open span into a completed one, keeping its position
+                        if let StatEntry::Start(opened_at) = *slot {
+                            *slot = StatEntry::Duration(event.at - opened_at);
+                        }
+                    }
+                    None => {
+                        // This should not happen with well-formed traces
+                        eprintln!(
+                            "Warning: Found SpanClose without a matching SpanOpen for label '{}': {:?}",
+                            label, event
+                        );
+                    }
+                }
+            }
+            // Events that are not part of a span carry no duration
+            EventType::OnTheFly => {}
+        }
+    }
+
+    // Keep the completed spans and warn about those that were never closed
+    let mut durations = Vec::new();
+    for entry in entries {
+        match entry {
+            StatEntry::Duration(duration) => durations.push(duration),
+            StatEntry::Start(opened_at) => {
+                eprintln!(
+                    "Warning: Unmatched SpanOpen at {:?} for label '{}'",
+                    opened_at, label
+                );
+            }
+        }
+    }
+
+    durations
+}
+
+/// Stores the mean, standard deviation, relative standard deviation (sd/mean),
+/// and the number of samples used for calculation.
+#[derive(Debug)]
+struct AggregateStat<T> {
+    /// Average duration.
+    mean_duration: T,
+    /// Standard deviation of durations.
+    sd_duration: T,
+    /// Standard deviation relative to the mean, formatted as a percentage (e.g. "12.34%").
+    sd_by_mean: String,
+    /// Number of duration measurements.
+    sample_size: usize,
+}
+
+impl AggregateStat<Duration> {
+    /// Calculates the mean and the population standard deviation of `durations`, both truncated
+    /// to whole nanoseconds. Panics if `durations` is empty.
+    fn analyze_durations(durations: &[Duration]) -> Self {
+        let sample_size = durations.len();
+        assert!(sample_size > 0, "Cannot analyze empty duration slice");
+
+        // Calculate the sum of durations
+        let sum: Duration = durations.iter().sum();
+        // Calculate the mean duration
+        let mean = sum / (sample_size as u32);
+        let mean_ns = mean.as_nanos();
+
+        // Calculate variance: sum of squared differences from the mean, divided by N.
+        // The unmodified mean has to be used here; any offset would inflate the variance.
+        let variance = durations
+            .iter()
+            .map(Duration::as_nanos)
+            .map(|d_ns| d_ns.abs_diff(mean_ns).pow(2)) // (duration_ns - mean_ns)^2
+            .sum::<u128>() // Sum of squares
+            / (sample_size as u128); // Divide by sample size
+
+        // Calculate standard deviation (sqrt of variance)
+        let sd_ns = (variance as f64).sqrt() as u128;
+        let sd = Duration::from_nanos(sd_ns as u64); // Convert back to Duration
+
+        // Calculate sd / mean in hundredths of a percent and format it with two decimals.
+        // `checked_div` covers a mean of zero, for which "0.00%" is reported.
+        let sd_rel_bp = (10000 * sd_ns).checked_div(mean_ns).unwrap_or(0);
+        let sd_rel_formatted = format!("{}.{:02}%", sd_rel_bp / 100, sd_rel_bp % 100);
+
+        AggregateStat {
+            mean_duration: mean,
+            sd_duration: sd,
+            sd_by_mean: sd_rel_formatted,
+            sample_size,
+        }
+    }
+
+    /// Writes the statistics as a single-line JSON object, followed by a newline, to `w`.
+    /// Includes metadata like label, protocol version, OS, architecture, and run time group.
+    ///
+    /// # Arguments
+    /// * `label` - The specific benchmark/span label.
+    /// * `protocol_version` - Version of the protocol that is benchmarked.
+    /// * `w` - The output writer (must implement `std::io::Write`).
+    fn write_json_ns(
+        &self,
+        label: &str,
+        protocol_version: &str,
+        w: &mut impl io::Write,
+    ) -> io::Result<()> {
+        // All values are emitted as JSON strings; `label` and `protocol_version` are not escaped
+        writeln!(
+            w,
+            r#"{{"name":"{name}", "unit":"ns/iter", "value":"{value}", "range":"± {range}", "protocol version":"{protocol_version}", "sample size":"{sample_size}", "operating system":"{os}", "architecture":"{arch}", "run time":"{run_time}"}}"#,
+            name = label,                          // Benchmark name
+            value = self.mean_duration.as_nanos(), // Mean duration in nanoseconds
+            range = self.sd_duration.as_nanos(),   // Standard deviation in nanoseconds
+            sample_size = self.sample_size,        // Number of samples
+            os = std::env::consts::OS,             // Operating system
+            arch = std::env::consts::ARCH,         // CPU architecture
+            run_time = run_time_group(label),      // Run time group category (long, medium, etc.)
+            protocol_version = protocol_version // Protocol version the samples were measured with
+        )
+    }
+}
diff --git a/rosenpass/src/protocol/protocol.rs b/rosenpass/src/protocol/protocol.rs
index c23a9c8..13df003 100644
--- a/rosenpass/src/protocol/protocol.rs
+++ b/rosenpass/src/protocol/protocol.rs
@@ -16,7 +16,9 @@ use std::{
 };
 
 use anyhow::{bail, ensure, Context, Result};
-use rand::Fill as Randomize;
+
+#[cfg(feature = "trace_bench")]
+use rosenpass_util::trace_bench::Trace as _;
 
 use crate::{hash_domains, msgs::*, RosenpassError};
 use memoffset::span_of;
@@ -3547,47 +3549,80 @@ impl CryptoServer {
     }
 }
 
+/// Marks a section of the protocol using the same identifiers as are used in the whitepaper and
+/// evaluates to the value of `$body`. When building with the `trace_bench` feature enabled, a span
+/// labeled `$label` is additionally emitted into the trace, which allows reconstructing the run
+/// times of the individual sections for performance measurement.
+macro_rules! protocol_section {
+    ($label:expr, $body:block) => {{
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span($label);
+        // The guard is bound to a name so that it stays alive until `$body` has been evaluated
+        #[allow(unused_braces)]
+        $body
+    }};
+}
+
 impl CryptoServer {
     /// Core cryptographic protocol implementation: Kicks of the handshake
     /// on the initiator side, producing the InitHello message.
     pub fn handle_initiation(&mut self, peer: PeerPtr, ih: &mut InitHello) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_initiation");
+
         let mut hs = InitiatorHandshake::zero_with_timestamp(
             self,
             peer.get(self).protocol_version.keyed_hash(),
         );
 
         // IHI1
-        hs.core.init(peer.get(self).spkt.deref())?;
+        protocol_section!("IHI1", {
+            hs.core.init(peer.get(self).spkt.deref())?;
+        });
 
         // IHI2
-        hs.core.sidi.randomize();
-        ih.sidi.copy_from_slice(&hs.core.sidi.value);
+        protocol_section!("IHI2", {
+            hs.core.sidi.randomize();
+            ih.sidi.copy_from_slice(&hs.core.sidi.value);
+        });
 
         // IHI3
-        EphemeralKem.keygen(hs.eski.secret_mut(), &mut *hs.epki)?;
-        ih.epki.copy_from_slice(&hs.epki.value);
+        protocol_section!("IHI3", {
+            EphemeralKem.keygen(hs.eski.secret_mut(), &mut *hs.epki)?;
+            ih.epki.copy_from_slice(&hs.epki.value);
+        });
 
         // IHI4
-        hs.core.mix(ih.sidi.as_slice())?.mix(ih.epki.as_slice())?;
+        protocol_section!("IHI4", {
+            hs.core.mix(ih.sidi.as_slice())?.mix(ih.epki.as_slice())?;
+        });
 
         // IHI5
-        hs.core
-            .encaps_and_mix(&StaticKem, &mut ih.sctr, peer.get(self).spkt.deref())?;
+        protocol_section!("IHI5", {
+            hs.core
+                .encaps_and_mix(&StaticKem, &mut ih.sctr, peer.get(self).spkt.deref())?;
+        });
 
         // IHI6
-        hs.core.encrypt_and_mix(
-            ih.pidic.as_mut_slice(),
-            self.pidm(peer.get(self).protocol_version.keyed_hash())?
-                .as_ref(),
-        )?;
+        protocol_section!("IHI6", {
+            hs.core.encrypt_and_mix(
+                ih.pidic.as_mut_slice(),
+                self.pidm(peer.get(self).protocol_version.keyed_hash())?
+                    .as_ref(),
+            )?;
+        });
 
         // IHI7
-        hs.core
-            .mix(self.spkm.deref())?
-            .mix(peer.get(self).psk.secret())?;
+        protocol_section!("IHI7", {
+            hs.core
+                .mix(self.spkm.deref())?
+                .mix(peer.get(self).psk.secret())?;
+        });
 
         // IHI8
-        hs.core.encrypt_and_mix(ih.auth.as_mut_slice(), &[])?;
+        protocol_section!("IHI8", {
+            hs.core.encrypt_and_mix(ih.auth.as_mut_slice(), &[])?;
+        });
 
         // Update the handshake hash last (not changing any state on prior error
         peer.hs().insert(self, hs)?;
@@ -3603,53 +3638,78 @@ impl CryptoServer {
         rh: &mut RespHello,
         keyed_hash: KeyedHash,
     ) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_init_hello");
+
         let mut core = HandshakeState::zero(keyed_hash);
 
         core.sidi = SessionId::from_slice(&ih.sidi);
 
         // IHR1
-        core.init(self.spkm.deref())?;
+        protocol_section!("IHR1", {
+            core.init(self.spkm.deref())?;
+        });
 
         // IHR4
-        core.mix(&ih.sidi)?.mix(&ih.epki)?;
+        protocol_section!("IHR4", {
+            core.mix(&ih.sidi)?.mix(&ih.epki)?;
+        });
 
         // IHR5
-        core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &ih.sctr)?;
+        protocol_section!("IHR5", {
+            core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &ih.sctr)?;
+        });
 
         // IHR6
-        let peer = {
+        let peer = protocol_section!("IHR6", {
             let mut peerid = PeerId::zero();
             core.decrypt_and_mix(&mut *peerid, &ih.pidic)?;
             self.find_peer(peerid)
                 .with_context(|| format!("No such peer {peerid:?}."))?
-        };
+        });
 
         // IHR7
-        core.mix(peer.get(self).spkt.deref())?
-            .mix(peer.get(self).psk.secret())?;
+        protocol_section!("IHR7", {
+            core.mix(peer.get(self).spkt.deref())?
+                .mix(peer.get(self).psk.secret())?;
+        });
 
         // IHR8
-        core.decrypt_and_mix(&mut [0u8; 0], &ih.auth)?;
+        protocol_section!("IHR8", {
+            core.decrypt_and_mix(&mut [0u8; 0], &ih.auth)?;
+        });
 
         // RHR1
-        core.sidr.randomize();
-        rh.sidi.copy_from_slice(core.sidi.as_ref());
-        rh.sidr.copy_from_slice(core.sidr.as_ref());
+        protocol_section!("RHR1", {
+            core.sidr.randomize();
+            rh.sidi.copy_from_slice(core.sidi.as_ref());
+            rh.sidr.copy_from_slice(core.sidr.as_ref());
+        });
 
         // RHR3
-        core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        protocol_section!("RHR3", {
+            core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        });
 
         // RHR4
-        core.encaps_and_mix(&EphemeralKem, &mut rh.ecti, &ih.epki)?;
+        protocol_section!("RHR4", {
+            core.encaps_and_mix(&EphemeralKem, &mut rh.ecti, &ih.epki)?;
+        });
 
         // RHR5
-        core.encaps_and_mix(&StaticKem, &mut rh.scti, peer.get(self).spkt.deref())?;
+        protocol_section!("RHR5", {
+            core.encaps_and_mix(&StaticKem, &mut rh.scti, peer.get(self).spkt.deref())?;
+        });
 
         // RHR6
-        core.store_biscuit(self, peer, &mut rh.biscuit)?;
+        protocol_section!("RHR6", {
+            core.store_biscuit(self, peer, &mut rh.biscuit)?;
+        });
 
         // RHR7
-        core.encrypt_and_mix(&mut rh.auth, &[])?;
+        protocol_section!("RHR7", {
+            core.encrypt_and_mix(&mut rh.auth, &[])?;
+        });
 
         Ok(peer)
     }
@@ -3657,6 +3717,9 @@ impl CryptoServer {
     /// Core cryptographic protocol implementation: Parses an [RespHello] message and produces an
     /// [InitConf] message on the initiator side.
     pub fn handle_resp_hello(&mut self, rh: &RespHello, ic: &mut InitConf) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_resp_hello");
+
         // RHI2
         let peer = self
             .lookup_handshake(SessionId::from_slice(&rh.sidi))
@@ -3700,24 +3763,34 @@ impl CryptoServer {
         //       to save us from the repetitive secret unwrapping
 
         // RHI3
-        core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        protocol_section!("RHI3", {
+            core.mix(&rh.sidr)?.mix(&rh.sidi)?;
+        });
 
         // RHI4
-        core.decaps_and_mix(
-            &EphemeralKem,
-            hs!().eski.secret(),
-            hs!().epki.deref(),
-            &rh.ecti,
-        )?;
+        protocol_section!("RHI4", {
+            core.decaps_and_mix(
+                &EphemeralKem,
+                hs!().eski.secret(),
+                hs!().epki.deref(),
+                &rh.ecti,
+            )?;
+        });
 
         // RHI5
-        core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &rh.scti)?;
+        protocol_section!("RHI5", {
+            core.decaps_and_mix(&StaticKem, self.sskm.secret(), self.spkm.deref(), &rh.scti)?;
+        });
 
         // RHI6
-        core.mix(&rh.biscuit)?;
+        protocol_section!("RHI6", {
+            core.mix(&rh.biscuit)?;
+        });
 
         // RHI7
-        core.decrypt_and_mix(&mut [0u8; 0], &rh.auth)?;
+        protocol_section!("RHI7", {
+            core.decrypt_and_mix(&mut [0u8; 0], &rh.auth)?;
+        });
 
         // TODO: We should just authenticate the entire network package up to the auth
         // tag as a pattern instead of mixing in fields separately
@@ -3726,27 +3799,33 @@ impl CryptoServer {
         ic.sidr.copy_from_slice(&rh.sidr);
 
         // ICI3
-        core.mix(&ic.sidi)?.mix(&ic.sidr)?;
-        ic.biscuit.copy_from_slice(&rh.biscuit);
+        protocol_section!("ICI3", {
+            core.mix(&ic.sidi)?.mix(&ic.sidr)?;
+            ic.biscuit.copy_from_slice(&rh.biscuit);
+        });
 
         // ICI4
-        core.encrypt_and_mix(&mut ic.auth, &[])?;
+        protocol_section!("ICI4", {
+            core.encrypt_and_mix(&mut ic.auth, &[])?;
+        });
 
         // Split() – We move the secrets into the session; we do not
         // delete the InitiatorHandshake, just clear it's secrets because
         // we still need it for InitConf message retransmission to function.
 
         // ICI7
-        peer.session().insert(
-            self,
-            core.enter_live(
+        protocol_section!("ICI7", {
+            peer.session().insert(
                 self,
-                HandshakeRole::Initiator,
-                peer.get(self).protocol_version.keyed_hash(),
-            )?,
-        )?;
-        hs_mut!().core.erase();
-        hs_mut!().next = HandshakeStateMachine::RespConf;
+                core.enter_live(
+                    self,
+                    HandshakeRole::Initiator,
+                    peer.get(self).protocol_version.keyed_hash(),
+                )?,
+            )?;
+            hs_mut!().core.erase();
+            hs_mut!().next = HandshakeStateMachine::RespConf;
+        });
 
         Ok(peer)
     }
@@ -3762,24 +3841,35 @@ impl CryptoServer {
         rc: &mut EmptyData,
         keyed_hash: KeyedHash,
     ) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_init_conf");
+
         // (peer, bn) ← LoadBiscuit(InitConf.biscuit)
         // ICR1
-        let (peer, biscuit_no, mut core) = HandshakeState::load_biscuit(
-            self,
-            &ic.biscuit,
-            SessionId::from_slice(&ic.sidi),
-            SessionId::from_slice(&ic.sidr),
-            keyed_hash,
-        )?;
+        let (peer, biscuit_no, mut core) = protocol_section!("ICR1", {
+            HandshakeState::load_biscuit(
+                self,
+                &ic.biscuit,
+                SessionId::from_slice(&ic.sidi),
+                SessionId::from_slice(&ic.sidr),
+                keyed_hash,
+            )?
+        });
 
         // ICR2
-        core.encrypt_and_mix(&mut [0u8; Aead::TAG_LEN], &[])?;
+        protocol_section!("ICR2", {
+            core.encrypt_and_mix(&mut [0u8; Aead::TAG_LEN], &[])?;
+        });
 
         // ICR3
-        core.mix(&ic.sidi)?.mix(&ic.sidr)?;
+        protocol_section!("ICR3", {
+            core.mix(&ic.sidi)?.mix(&ic.sidr)?;
+        });
 
         // ICR4
-        core.decrypt_and_mix(&mut [0u8; 0], &ic.auth)?;
+        protocol_section!("ICR4", {
+            core.decrypt_and_mix(&mut [0u8; 0], &ic.auth)?;
+        });
 
         // ICR5
         // Defense against replay attacks; implementations may accept
@@ -3791,20 +3881,24 @@ impl CryptoServer {
         );
 
         // ICR6
-        peer.get_mut(self).biscuit_used = biscuit_no;
+        protocol_section!("ICR6", {
+            peer.get_mut(self).biscuit_used = biscuit_no;
+        });
 
         // ICR7
-        peer.session().insert(
-            self,
-            core.enter_live(
+        protocol_section!("ICR7", {
+            peer.session().insert(
                 self,
-                HandshakeRole::Responder,
-                peer.get(self).protocol_version.keyed_hash(),
-            )?,
-        )?;
-        // TODO: This should be part of the protocol specification.
-        // Abort any ongoing handshake from initiator role
-        peer.hs().take(self);
+                core.enter_live(
+                    self,
+                    HandshakeRole::Responder,
+                    peer.get(self).protocol_version.keyed_hash(),
+                )?,
+            )?;
+            // TODO: This should be part of the protocol specification.
+            // Abort any ongoing handshake from initiator role
+            peer.hs().take(self);
+        });
 
         // TODO: Implementing RP should be possible without touching the live session stuff
         // TODO: I fear that this may lead to race conditions; the acknowledgement may be
@@ -3854,6 +3948,9 @@ impl CryptoServer {
         msg_in: &Ref<&[u8], Envelope<EmptyData>>,
         seal_broken: String,
     ) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_resp_conf");
+
         let rc: &EmptyData = &msg_in.payload;
         let sid = SessionId::from_slice(&rc.sid);
         let hs = self
@@ -3907,6 +4004,9 @@ impl CryptoServer {
     ///
     /// See more on DOS mitigation in Rosenpass in the [whitepaper](https://rosenpass.eu/whitepaper.pdf).
     pub fn handle_cookie_reply(&mut self, cr: &CookieReply) -> Result<PeerPtr> {
+        #[cfg(feature = "trace_bench")]
+        let _span_guard = rosenpass_util::trace_bench::trace().emit_span("handle_cookie_reply");
+
         let peer_ptr: Option<PeerPtr> = self
             .lookup_session(Public::new(cr.inner.sid))
             .map(|v| PeerPtr(v.0))
@@ -4030,7 +4130,7 @@ pub mod testutils {
 
 #[cfg(test)]
 mod test {
-    use std::{borrow::BorrowMut, net::SocketAddrV4, ops::DerefMut, thread::sleep, time::Duration};
+    use std::{borrow::BorrowMut, net::SocketAddrV4, ops::DerefMut};
 
     use super::*;
     use serial_test::serial;
diff --git a/util/Cargo.toml b/util/Cargo.toml
index ccc91c8..bdd1ddf 100644
--- a/util/Cargo.toml
+++ b/util/Cargo.toml
@@ -24,6 +24,8 @@ thiserror = { workspace = true }
 mio = { workspace = true }
 tempfile = { workspace = true }
 uds = { workspace = true, optional = true, features = ["mio_1xx"] }
+libcrux-test-utils = { workspace = true, optional = true }
 
 [features]
 experiment_file_descriptor_passing = ["uds"]
+trace_bench = ["dep:libcrux-test-utils"]
diff --git a/util/src/lib.rs b/util/src/lib.rs
index 66d2387..7949a3b 100644
--- a/util/src/lib.rs
+++ b/util/src/lib.rs
@@ -30,6 +30,9 @@ pub mod option;
 pub mod result;
 /// Time and duration utilities.
 pub mod time;
+/// Trace benchmarking utilities
+#[cfg(feature = "trace_bench")]
+pub mod trace_bench;
 /// Type-level numbers and arithmetic.
 pub mod typenum;
 /// Zero-copy serialization utilities.
diff --git a/util/src/trace_bench.rs b/util/src/trace_bench.rs
new file mode 100644
index 0000000..367efa6
--- /dev/null
+++ b/util/src/trace_bench.rs
@@ -0,0 +1,23 @@
+use std::sync::OnceLock;
+use std::time::Instant;
+
+use libcrux_test_utils::tracing;
+
+/// The trace value used in all Rosenpass crates.
+static TRACE: OnceLock<RpTrace> = OnceLock::new();
+
+/// The trace type used to trace Rosenpass for performance measurement.
+pub type RpTrace = tracing::MutexTrace<&'static str, Instant>;
+
+/// The trace event type used to trace Rosenpass for performance measurement.
+pub type RpEventType = tracing::TraceEvent<&'static str, Instant>;
+
+// Re-exported so that callers can use this functionality without depending on
+// [`libcrux_test_utils`] directly.
+pub use libcrux_test_utils::tracing::trace_span;
+pub use tracing::Trace;
+
+/// Returns the shared trace, creating it on first use.
+pub fn trace() -> &'static RpTrace {
+    TRACE.get_or_init(RpTrace::default)
+}