Merge pull request #2147 from mandiant/release/v710

bump to v7.1.0
2025-12-07 21:30:35 -08:00 · 2024-06-14 12:56:46 +02:00 · 2024-06-14 09:06:04 +00:00 · 2024-06-14 09:02:02 +00:00 · 2024-06-13 13:24:33 +02:00 · 2024-06-13 13:23:47 +02:00
138 changed files with 2711 additions and 909 deletions
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -57,7 +57,7 @@ When we make a significant decision in how we maintain the project and what we c
 we will document it in the [capa issues tracker](https://github.com/mandiant/capa/issues).
 This is the best place to review our discussions about what/how/why we do things in the project.
 If you have a question, check to see if it is documented there.
-If it is *not* documented there, or you can't find an answer, please open a issue.
+If it is *not* documented there, or you can't find an answer, please open an issue.
 We'll link to existing issues when appropriate to keep discussions in one place.

 ## How Can I Contribute?
--- a/.github/flake8.ini
+++ b/.github/flake8.ini
@@ -40,4 +40,4 @@ per-file-ignores =

 copyright-check = True
 copyright-min-file-size = 1 
-copyright-regexp = Copyright \(C\) 2023 Mandiant, Inc. All Rights Reserved.
+copyright-regexp = Copyright \(C\) \d{4} Mandiant, Inc. All Rights Reserved.
--- a/.github/mypy/mypy.ini
+++ b/.github/mypy/mypy.ini
@@ -1,8 +1,5 @@
 [mypy]

-[mypy-halo.*]
-ignore_missing_imports = True
-
 [mypy-tqdm.*]
 ignore_missing_imports = True

--- a/.github/pyinstaller/hooks/hook-vivisect.py
+++ b/.github/pyinstaller/hooks/hook-vivisect.py
@@ -24,7 +24,7 @@ excludedimports = [
    "pyqtwebengine",
    # the above are imported by these viv modules.
    # so really, we'd want to exclude these submodules of viv.
-    # but i dont think this works.
+    # but i don't think this works.
    "vqt",
    "vdb.qt",
    "envi.qt",
--- a/.github/pyinstaller/pyinstaller.spec
+++ b/.github/pyinstaller/pyinstaller.spec
@@ -1,10 +1,19 @@
 # -*- mode: python -*-
 # Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
-import os.path
-import subprocess
+import sys

 import wcwidth
+import capa.rules.cache

+from pathlib import Path
+
+# SPECPATH is a global variable provided by PyInstaller that points to the directory containing the .spec file
+capa_dir = Path(SPECPATH).parent.parent
+rules_dir = capa_dir / 'rules'
+cache_dir = capa_dir / 'cache'
+
+if not capa.rules.cache.generate_rule_cache(rules_dir, cache_dir):
+    sys.exit(-1)

 a = Analysis(
    # when invoking pyinstaller from the project root,
@@ -26,7 +35,7 @@ a = Analysis(
        # so we manually embed the wcwidth resources here.
        #
        # ref: https://stackoverflow.com/a/62278462/87207
-        (os.path.dirname(wcwidth.__file__), "wcwidth"),
+        (Path(wcwidth.__file__).parent, "wcwidth"),
    ],
    # when invoking pyinstaller from the project root,
    # this gets run from the project root.
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -39,11 +39,11 @@ jobs:
            python_version: 3.8
    steps:
      - name: Checkout capa
-        uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          submodules: true
      - name: Set up Python ${{ matrix.python_version }}
-        uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
        with:
          python-version: ${{ matrix.python_version }}
      - if: matrix.os == 'ubuntu-20.04'
@@ -51,9 +51,9 @@ jobs:
      - name: Upgrade pip, setuptools
        run: python -m pip install --upgrade pip setuptools
      - name: Install capa with build requirements
-        run: pip install -e .[build]
-      - name: Cache the rule set
-        run: python ./scripts/cache-ruleset.py ./rules/ ./cache/
+        run: |
+          pip install -r requirements.txt
+          pip install -e .[build]
      - name: Build standalone executable
        run: pyinstaller --log-level DEBUG .github/pyinstaller/pyinstaller.spec
      - name: Does it run (PE)?
@@ -66,7 +66,7 @@ jobs:
        run: |
          7z e "tests/data/dynamic/cape/v2.2/d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7.json.gz"
          dist/capa -d "d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7.json"
-      - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
+      - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
        with:
          name: ${{ matrix.asset_name }}
          path: dist/${{ matrix.artifact_name }}
@@ -90,7 +90,7 @@ jobs:
            asset_name: windows
    steps:
      - name: Download ${{ matrix.asset_name }}
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # v3.0.2
+        uses: actions/download-artifact@eaceaf801fd36c7dee90939fad912460b18a1ffe # v4.1.2
        with:
          name: ${{ matrix.asset_name }}
      - name: Set executable flag
@@ -118,7 +118,7 @@ jobs:
            artifact_name: capa
    steps:
      - name: Download ${{ matrix.asset_name }}
-        uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # v3.0.2
+        uses: actions/download-artifact@eaceaf801fd36c7dee90939fad912460b18a1ffe # v4.1.2
        with:
          name: ${{ matrix.asset_name }}
      - name: Set executable flag
--- a/.github/workflows/changelog.yml
+++ b/.github/workflows/changelog.yml
@@ -7,7 +7,8 @@ on:
  pull_request_target:
    types: [opened, edited, synchronize]

-permissions: read-all
+permissions:
+  pull-requests: write

 jobs:
  check_changelog:
@@ -19,7 +20,7 @@ jobs:
    steps:
    - name: Get changed files
      id: files
-      uses: Ana06/get-changed-files@e0c398b7065a8d84700c471b6afc4116d1ba4e96 # v2.2.0
+      uses: Ana06/get-changed-files@25f79e676e7ea1868813e21465014798211fad8c # v2.3.0
    - name: check changelog updated
      id: changelog_updated
      env:
@@ -29,14 +30,14 @@ jobs:
        echo $FILES | grep -qF 'CHANGELOG.md' || echo $PR_BODY | grep -qiF "$NO_CHANGELOG"
    - name: Reject pull request if no CHANGELOG update
      if: ${{ always() && steps.changelog_updated.outcome == 'failure' }}
-      uses: Ana06/automatic-pull-request-review@0cf4e8a17ba79344ed3fdd7fed6dd0311d08a9d4 # v0.1.0
+      uses: Ana06/automatic-pull-request-review@76aaf9b15b116a54e1da7a28a46f91fe089600bf # v0.2.0
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        event: REQUEST_CHANGES
        body: "Please add bug fixes, new features, breaking changes and anything else you think is worthwhile mentioning to the `master (unreleased)` section of CHANGELOG.md. If no CHANGELOG update is needed add the following to the PR description: `${{ env.NO_CHANGELOG }}`"
        allow_duplicate: false
    - name: Dismiss previous review if CHANGELOG update
-      uses: Ana06/automatic-pull-request-review@0cf4e8a17ba79344ed3fdd7fed6dd0311d08a9d4 # v0.1.0
+      uses: Ana06/automatic-pull-request-review@76aaf9b15b116a54e1da7a28a46f91fe089600bf # v0.2.0
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        event: DISMISS
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -17,20 +17,21 @@ jobs:
    permissions:
      id-token: write
    steps:
-      - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      - name: Set up Python
-        uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
+          pip install -r requirements.txt
          pip install -e .[build]
      - name: build package
        run: |
          python -m build
      - name: upload package artifacts
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
        with:
          path: dist/*
      - name: publish package
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -32,12 +32,12 @@ jobs:

    steps:
      - name: "Checkout code"
-        uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1
        with:
          persist-credentials: false

      - name: "Run analysis"
-        uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6
+        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
        with:
          results_file: results.sarif
          results_format: sarif
@@ -59,7 +59,7 @@ jobs:
      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
      # format to the repository Actions tab.
      - name: "Upload artifact"
-        uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
        with:
          name: SARIF file
          path: results.sarif
@@ -67,6 +67,6 @@ jobs:

      # Upload the results to GitHub's code scanning dashboard.
      - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@807578363a7869ca324a79039e6db9c843e0e100 # v2.1.27
+        uses: github/codeql-action/upload-sarif@8a470fddafa5cbb6266ee11b37ef4d8aae19c571  # v3.24.6
        with:
          sarif_file: results.sarif
--- a/.github/workflows/tag.yml
+++ b/.github/workflows/tag.yml
@@ -12,7 +12,7 @@ jobs:
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout capa-rules
-      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      with:
        repository: mandiant/capa-rules
        token: ${{ secrets.CAPA_TOKEN }}
@@ -25,7 +25,7 @@ jobs:
        git tag $name -m "https://github.com/mandiant/capa/releases/$name"
        # TODO update branch name-major=${name%%.*}
    - name: Push tag to capa-rules
-      uses: ad-m/github-push-action@0fafdd62b84042d49ec0cb92d9cac7f7ce4ec79e # master
+      uses: ad-m/github-push-action@d91a481090679876dfc4178fef17f286781251df # v0.8.0
      with:
        repository: mandiant/capa-rules
        github_token: ${{ secrets.CAPA_TOKEN }}
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout capa
-      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
    # The sync GH action in capa-rules relies on a single '- *$' in the CHANGELOG file
    - name: Ensure CHANGELOG has '- *$'
      run: |
@@ -28,14 +28,16 @@ jobs:
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout capa
-      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
    # use latest available python to take advantage of best performance
    - name: Set up Python 3.11
-      uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
      with:
        python-version: "3.11"
    - name: Install dependencies
-      run: pip install -e .[dev]
+      run: |
+        pip install -r requirements.txt
+        pip install -e .[dev,scripts]
    - name: Lint with ruff
      run: pre-commit run ruff
    - name: Lint with isort
@@ -45,21 +47,25 @@ jobs:
    - name: Lint with flake8
      run: pre-commit run flake8 --hook-stage manual
    - name: Check types with mypy
-      run:  pre-commit run mypy --hook-stage manual
+      run: pre-commit run mypy --hook-stage manual
+    - name: Check imports against dependencies
+      run: pre-commit run deptry --hook-stage manual

  rule_linter:
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout capa with submodules
-      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      with:
        submodules: recursive
    - name: Set up Python 3.11
-      uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
      with:
        python-version: "3.11"
    - name: Install capa
-      run: pip install -e .[dev]
+      run: |
+        pip install -r requirements.txt
+        pip install -e .[dev,scripts]
    - name: Run rule linter
      run: python scripts/lint.py rules/

@@ -83,18 +89,20 @@ jobs:
            python-version: "3.10"
    steps:
    - name: Checkout capa with submodules
-      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      with:
        submodules: recursive
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install pyyaml
      if: matrix.os == 'ubuntu-20.04'
      run: sudo apt-get install -y libyaml-dev
    - name: Install capa
-      run: pip install -e .[dev]
+      run: |
+        pip install -r requirements.txt
+        pip install -e .[dev,scripts]
    - name: Run tests (fast)
      # this set of tests runs about 80% of the cases in 20% of the time,
      # and should catch most errors quickly.
@@ -106,7 +114,7 @@ jobs:
    name: Binary Ninja tests for ${{ matrix.python-version }}
    env:
      BN_SERIAL: ${{ secrets.BN_SERIAL }}
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
    needs: [tests]
    strategy:
      fail-fast: false
@@ -116,12 +124,12 @@ jobs:
    - name: Checkout capa with submodules
      # only run if BN_SERIAL is available; this check has to be repeated in every step, see https://github.com/orgs/community/discussions/26726#discussioncomment-3253118
      if: ${{ env.BN_SERIAL != 0 }}
-      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      with:
        submodules: recursive
    - name: Set up Python ${{ matrix.python-version }}
      if: ${{ env.BN_SERIAL != 0 }}
-      uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install pyyaml
@@ -129,7 +137,9 @@ jobs:
      run: sudo apt-get install -y libyaml-dev
    - name: Install capa
      if: ${{ env.BN_SERIAL != 0 }}
-      run: pip install -e .[dev]
+      run: |
+        pip install -r requirements.txt
+        pip install -e .[dev,scripts]
    - name: install Binary Ninja
      if: ${{ env.BN_SERIAL != 0 }}
      run: |
@@ -153,31 +163,23 @@ jobs:
      matrix:
        python-version: ["3.8", "3.11"]
        java-version: ["17"]
-        gradle-version: ["7.3"]
-        ghidra-version: ["10.3"]
-        public-version: ["PUBLIC_20230510"] # for ghidra releases
-        jep-version: ["4.1.1"]
-        ghidrathon-version: ["3.0.0"]
+        ghidra-version: ["11.0.1"]
+        public-version: ["PUBLIC_20240130"] # for ghidra releases
+        ghidrathon-version: ["4.0.0"] 
    steps:
    - name: Checkout capa with submodules
-      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      with:
        submodules: true
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Set up Java ${{ matrix.java-version }}
-      uses: actions/setup-java@5ffc13f4174014e2d4d4572b3d74c3fa61aeb2c2 # v3
+      uses: actions/setup-java@387ac29b308b003ca37ba93a6cab5eb57c8f5f93 # v4.0.0
      with:
        distribution: 'temurin'
        java-version: ${{ matrix.java-version }}
-    - name: Set up Gradle ${{ matrix.gradle-version }} 
-      uses: gradle/gradle-build-action@40b6781dcdec2762ad36556682ac74e31030cfe2 # v2.5.1
-      with:
-        gradle-version: ${{ matrix.gradle-version }}
-    - name: Install Jep ${{ matrix.jep-version }} 
-      run : pip install jep==${{ matrix.jep-version }}
    - name: Install Ghidra ${{ matrix.ghidra-version }} 
      run: |
        mkdir ./.github/ghidra
@@ -186,14 +188,17 @@ jobs:
    - name: Install Ghidrathon
      run : |
        mkdir ./.github/ghidrathon
-        curl -o ./.github/ghidrathon/ghidrathon-${{ matrix.ghidrathon-version }}.zip "https://codeload.github.com/mandiant/Ghidrathon/zip/refs/tags/v${{ matrix.ghidrathon-version }}"
-        unzip .github/ghidrathon/ghidrathon-${{ matrix.ghidrathon-version }}.zip -d .github/ghidrathon/
-        gradle -p ./.github/ghidrathon/Ghidrathon-${{ matrix.ghidrathon-version }}/ -PGHIDRA_INSTALL_DIR=$(pwd)/.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC
-        unzip .github/ghidrathon/Ghidrathon-${{ matrix.ghidrathon-version }}/dist/*.zip -d .github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC/Ghidra/Extensions
+        wget "https://github.com/mandiant/Ghidrathon/releases/download/v${{ matrix.ghidrathon-version }}/Ghidrathon-v${{ matrix.ghidrathon-version}}.zip" -O ./.github/ghidrathon/ghidrathon-v${{ matrix.ghidrathon-version }}.zip
+        unzip .github/ghidrathon/ghidrathon-v${{ matrix.ghidrathon-version }}.zip -d .github/ghidrathon/
+        python -m pip install -r .github/ghidrathon/requirements.txt
+        python .github/ghidrathon/ghidrathon_configure.py $(pwd)/.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC
+        unzip .github/ghidrathon/Ghidrathon-v${{ matrix.ghidrathon-version }}.zip -d .github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC/Ghidra/Extensions
    - name: Install pyyaml
      run: sudo apt-get install -y libyaml-dev
    - name: Install capa
-      run: pip install -e .[dev] 
+      run: |
+        pip install -r requirements.txt
+        pip install -e .[dev,scripts]
    - name: Run tests
      run: | 
        mkdir ./.github/ghidra/project
@@ -201,4 +206,4 @@ jobs:
        cat ../output.log
        exit_code=$(cat ../output.log | grep exit | awk '{print $NF}')
        exit $exit_code
- 
+ 
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -110,6 +110,16 @@ repos:
        always_run: true
        pass_filenames: false

+-   repo: local
+    hooks:
+    -   id: deptry
+        name: deptry
+        stages: [push, manual]
+        language: system
+        entry: deptry .
+        always_run: true
+        pass_filenames: false
+
 -   repo: local
    hooks:
    -   id: pytest-fast
@@ -127,3 +137,4 @@ repos:
        -   "--ignore=tests/test_scripts.py"
        always_run: true
        pass_filenames: false
+
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,24 +4,102 @@

 ### New Features

-
 ### Breaking Changes

-
 ### New Rules (0)

 -

 ### Bug Fixes

-
 ### capa explorer IDA Pro plugin

 ### Development

 ### Raw diffs
- [capa v7.0.1...master](https://github.com/mandiant/capa/compare/v7.0.1...master)
- [capa-rules v7.0.1...master](https://github.com/mandiant/capa-rules/compare/v7.0.1...master)
+- [capa v7.1.0...master](https://github.com/mandiant/capa/compare/v7.1.0...master)
+- [capa-rules v7.1.0...master](https://github.com/mandiant/capa-rules/compare/v7.1.0...master)
+
+## v7.1.0
+The v7.1.0 release brings large performance improvements to capa's rule matching engine.
+Additionally, we've fixed various bugs and added new features for people using and developing capa.
+
+Special thanks to our repeat and new contributors:
+* @sjha2048 made their first contribution in https://github.com/mandiant/capa/pull/2000
+* @Rohit1123 made their first contribution in https://github.com/mandiant/capa/pull/1990
+* @psahithireddy made their first contribution in https://github.com/mandiant/capa/pull/2020
+* @Atlas-64 made their first contribution in https://github.com/mandiant/capa/pull/2018
+* @s-ff made their first contribution in https://github.com/mandiant/capa/pull/2011
+* @samadpls made their first contribution in https://github.com/mandiant/capa/pull/2024
+* @acelynnzhang made their first contribution in https://github.com/mandiant/capa/pull/2044
+* @RainRat made their first contribution in https://github.com/mandiant/capa/pull/2058
+* @ReversingWithMe made their first contribution in https://github.com/mandiant/capa/pull/2093
+* @malwarefrank made their first contribution in https://github.com/mandiant/capa/pull/2037
+
+### New Features
+- Emit "dotnet" as format to ResultDocument when processing .NET files #2024 @samadpls
+- ELF: detect OS from statically-linked Go binaries #1978 @williballenthin
+- add function in capa/helpers to load plain and compressed JSON reports #1883 @Rohit1123
+- document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz
+- Add JSON to SARIF conversion script @ReversingWithMe
+- render maec/* fields #843 @s-ff
+- replace Halo spinner with Rich #2086 @s-ff
+- optimize rule matching #2080 @williballenthin
+- add aarch64 as a valid architecture #2144 mehunhoff@google.com @williballenthin
+- relax dependency version requirements for the capa library #2053 @williballenthin
+- add scripts dependency group and update documentation #2145 @mr-tz
+
+### New Rules (25)
+
+- impact/wipe-disk/delete-drive-layout-via-ioctl william.ballenthin@mandiant.com
+- host-interaction/driver/interact-with-driver-via-ioctl moritz.raabe@mandiant.com
+- host-interaction/driver/unload-driver moritz.raabe@mandiant.com
+- nursery/get-disk-information-via-ioctl william.ballenthin@mandiant.com
+- nursery/get-volume-information-via-ioctl william.ballenthin@mandiant.com
+- nursery/unmount-volume-via-ioctl william.ballenthin@mandiant.com
+- data-manipulation/encryption/rc4/encrypt-data-using-rc4-via-systemfunction033 daniel.stepanic@elastic.co
+- anti-analysis/anti-forensic/self-deletion/self-delete-using-alternate-data-streams daniel.stepanic@elastic.co
+- nursery/change-memory-permission-on-linux mehunhoff@google.com
+- nursery/check-file-permission-on-linux mehunhoff@google.com
+- nursery/check-if-process-is-running-under-android-emulator-on-android mehunhoff@google.com
+- nursery/map-or-unmap-memory-on-linux mehunhoff@google.com
+- persistence/act-as-share-provider-dll jakub.jozwiak@mandiant.com
+- persistence/act-as-windbg-extension jakub.jozwiak@mandiant.com
+- persistence/act-as-time-provider-dll jakub.jozwiak@mandiant.com
+- host-interaction/gui/window/hide/hide-graphical-window-from-taskbar jakub.jozwiak@mandiant.com
+- compiler/dart/compiled-with-dart jakub.jozwiak@mandiant.com
+- nursery/bypass-hidden-api-restrictions-via-jni-on-android mehunhoff@google.com
+- nursery/get-current-process-filesystem-mounts-on-linux mehunhoff@google.com
+- nursery/get-current-process-memory-mapping-on-linux mehunhoff@google.com
+- nursery/get-system-property-on-android mehunhoff@google.com
+- nursery/hook-routines-via-lsplant mehunhoff@google.com
+- nursery/load-packed-dex-via-jiagu-on-android mehunhoff@google.com
+- nursery/modify-api-blacklist-or-denylist-via-jni-on-android mehunhoff@google.com
+- nursery/truncate-file-on-linux mehunhoff@google.com
+
+### Bug Fixes
+
+- do some imports closer to where they are used #1810 @williballenthin
+- binja: fix and simplify stack string detection code after binja 4.0 @xusheng6
+- binja: add support for forwarded export #1646 @xusheng6
+- cape: support more report formats #2035 @mr-tz
+
+### capa explorer IDA Pro plugin
+- replace deprecated IDA API find_binary with bin_search #1606 @s-ff
+
+### Development
+
+- ci: Fix PR review in the changelog check GH action #2004 @Ana06
+- ci: use rules number badge stored in our bot gist and generated using `schneegans/dynamic-badges-action` #2001 capa-rules#882 @Ana06
+- ci: update github workflows to use latest version of actions that were using a deprecated version of node #1967 #2003 capa-rules#883 @sjha2048 @Ana06
+- ci: update binja version to stable 4.0 #2016 @xusheng6
+- ci: update github workflows to reflect the latest ghidrathon installation and bumped up jep, ghidra versions #2020 @psahithireddy
+- ci: include rule caching in PyInstaller build process #2097 @s-ff
+- add deptry support #1497 @s-ff
+
+### Raw diffs
+- [capa v7.0.1...v7.1.0](https://github.com/mandiant/capa/compare/v7.0.1...v7.1.0)
+- [capa-rules v7.0.1...v7.1.0](https://github.com/mandiant/capa-rules/compare/v7.0.1...v7.1.0)

 ## v7.0.1

@@ -271,7 +349,7 @@ For those that use capa as a library, we've introduced some limited breaking cha
 - [capa-rules v5.1.0...v6.0.0](https://github.com/mandiant/capa-rules/compare/v5.1.0...v6.0.0)

 ## v5.1.0
-capa version 5.1.0 adds a Protocol Buffers (protobuf) format for result documents. Additionally, the [Vector35](https://vector35.com/) team contributed a new feature extractor using Binary Ninja. Other new features are a new CLI flag to override the detected operating system, functionality to read and render existing result documents, and a output color format that's easier to read.
+capa version 5.1.0 adds a Protocol Buffers (protobuf) format for result documents. Additionally, the [Vector35](https://vector35.com/) team contributed a new feature extractor using Binary Ninja. Other new features are a new CLI flag to override the detected operating system, functionality to read and render existing result documents, and an output color format that's easier to read.

 Over 25 capa rules have been added and improved.

@@ -1470,7 +1548,7 @@ The IDA Pro integration is now distributed as a real plugin, instead of a script
  - updates distributed PyPI/`pip install --upgrade` without touching your `%IDADIR%`
  - generally doing things the "right way"

-How to get this new version? Its easy: download [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ida/plugin/capa_explorer.py) to your IDA plugins directory and update your capa installation (incidentally, this is a good opportunity to migrate to `pip install flare-capa` instead of git checkouts). Now you should see the plugin listed in the `Edit > Plugins > FLARE capa explorer` menu in IDA. 
+How to get this new version? It's easy: download [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ida/plugin/capa_explorer.py) to your IDA plugins directory and update your capa installation (incidentally, this is a good opportunity to migrate to `pip install flare-capa` instead of git checkouts). Now you should see the plugin listed in the `Edit > Plugins > FLARE capa explorer` menu in IDA. 

 Please refer to the plugin [readme](https://github.com/mandiant/capa/blob/master/capa/ida/plugin/README.md) for additional information on installing and using the IDA Pro plugin.

--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - name: "The FLARE Team"
+title: "capa, a tool to identify capabilities in programs and sandbox traces."
+date-released: 2020-07-16
+url: "https://github.com/mandiant/capa"
+
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -187,7 +187,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright (C) 2023 Mandiant, Inc.
+   Copyright (C) 2020 Mandiant, Inc.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/flare-capa)](https://pypi.org/project/flare-capa)
 [![Last release](https://img.shields.io/github/v/release/mandiant/capa)](https://github.com/mandiant/capa/releases)
-[![Number of rules](https://img.shields.io/badge/rules-866-blue.svg)](https://github.com/mandiant/capa-rules)
+[![Number of rules](https://gist.githubusercontent.com/capa-bot/6d7960e911f48b3b74916df8988cf0f3/raw/rules_badge.svg)](https://github.com/mandiant/capa-rules)
 [![CI status](https://github.com/mandiant/capa/workflows/CI/badge.svg)](https://github.com/mandiant/capa/actions?query=workflow%3ACI+event%3Apush+branch%3Amaster)
 [![Downloads](https://img.shields.io/github/downloads/mandiant/capa/total)](https://github.com/mandiant/capa/releases)
 [![License](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE.txt)
@@ -126,7 +126,7 @@ function @ 0x4011C0
 ...
 ```

-Additionally, capa also supports analyzing [CAPE](https://github.com/kevoreilly/CAPEv2) sandbox reports for dynamic capabilty extraction.
+Additionally, capa also supports analyzing [CAPE](https://github.com/kevoreilly/CAPEv2) sandbox reports for dynamic capability extraction.
 In order to use this, you first submit your sample to CAPE for analysis, and then run capa against the generated report (JSON).

 Here's an example of running capa against a packed binary, and then running capa against the CAPE report of that binary:
@@ -260,7 +260,9 @@ capa explorer helps you identify interesting areas of a program and build new ca

 ![capa + IDA Pro integration](https://github.com/mandiant/capa/blob/master/doc/img/explorer_expanded.png)

-If you use Ghidra, you can use the Python 3 [Ghidra feature extractor](/capa/ghidra/). This integration enables capa to extract features directly from your Ghidra database, which can help you identify capabilities in programs that you analyze using Ghidra.
+If you use Ghidra, then you can use the [capa + Ghidra integration](/capa/ghidra/) to run capa's analysis directly on your Ghidra database and render the results in Ghidra's user interface.
+
+<img src="https://github.com/mandiant/capa/assets/66766340/eeae33f4-99d4-42dc-a5e8-4c1b8c661492" width=300>

 # further information
 ## capa
--- a/capa/capabilities/dynamic.py
+++ b/capa/capabilities/dynamic.py
@@ -6,6 +6,7 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+import sys
 import logging
 import itertools
 import collections
@@ -65,7 +66,7 @@ def find_thread_capabilities(
    features: FeatureSet = collections.defaultdict(set)

    # matches found at the call scope.
-    # might be found at different calls, thats ok.
+    # might be found at different calls, that's ok.
    call_matches: MatchResults = collections.defaultdict(list)

    for ch in extractor.get_calls(ph, th):
@@ -103,11 +104,11 @@ def find_process_capabilities(
    process_features: FeatureSet = collections.defaultdict(set)

    # matches found at the basic threads.
-    # might be found at different threads, thats ok.
+    # might be found at different threads, that's ok.
    thread_matches: MatchResults = collections.defaultdict(list)

    # matches found at the call scope.
-    # might be found at different calls, thats ok.
+    # might be found at different calls, that's ok.
    call_matches: MatchResults = collections.defaultdict(list)

    for th in extractor.get_threads(ph):
@@ -147,6 +148,11 @@ def find_dynamic_capabilities(
                def pbar(s, *args, **kwargs):
                    return s

+            elif not sys.stderr.isatty():
+                # don't display progress bar when stderr is redirected to a file
+                def pbar(s, *args, **kwargs):
+                    return s
+
            processes = list(extractor.get_processes())

            pb = pbar(processes, desc="matching", unit=" processes", leave=False)
--- a/capa/capabilities/static.py
+++ b/capa/capabilities/static.py
@@ -6,6 +6,7 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+import sys
 import time
 import logging
 import itertools
@@ -66,7 +67,7 @@ def find_basic_block_capabilities(
    features: FeatureSet = collections.defaultdict(set)

    # matches found at the instruction scope.
-    # might be found at different instructions, thats ok.
+    # might be found at different instructions, that's ok.
    insn_matches: MatchResults = collections.defaultdict(list)

    for insn in extractor.get_instructions(f, bb):
@@ -106,11 +107,11 @@ def find_code_capabilities(
    function_features: FeatureSet = collections.defaultdict(set)

    # matches found at the basic block scope.
-    # might be found at different basic blocks, thats ok.
+    # might be found at different basic blocks, that's ok.
    bb_matches: MatchResults = collections.defaultdict(list)

    # matches found at the instruction scope.
-    # might be found at different instructions, thats ok.
+    # might be found at different instructions, that's ok.
    insn_matches: MatchResults = collections.defaultdict(list)

    for bb in extractor.get_basic_blocks(fh):
@@ -156,6 +157,11 @@ def find_static_capabilities(
                def pbar(s, *args, **kwargs):
                    return s

+            elif not sys.stderr.isatty():
+                # don't display progress bar when stderr is redirected to a file
+                def pbar(s, *args, **kwargs):
+                    return s
+
            functions = list(extractor.get_functions())
            n_funcs = len(functions)

@@ -182,9 +188,16 @@ def find_static_capabilities(
                )
                t1 = time.time()

-                match_count = sum(len(res) for res in function_matches.values())
-                match_count += sum(len(res) for res in bb_matches.values())
-                match_count += sum(len(res) for res in insn_matches.values())
+                match_count = 0
+                for name, matches_ in itertools.chain(
+                    function_matches.items(), bb_matches.items(), insn_matches.items()
+                ):
+                    # in practice, most matches are derived rules,
+                    # like "check OS version/5bf4c7f39fd4492cbed0f6dc7d596d49"
+                    # but when we log to the human, they really care about "real" rules.
+                    if not ruleset.rules[name].is_subscope_rule():
+                        match_count += len(matches_)
+
                logger.debug(
                    "analyzed function 0x%x and extracted %d features, %d matches in %0.02fs",
                    f.address,
@@ -213,7 +226,7 @@ def find_static_capabilities(
    all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features)
    feature_counts.file = feature_count

-    matches = dict(
+    matches: MatchResults = dict(
        itertools.chain(
            # each rule exists in exactly one scope,
            # so there won't be any overlap among these following MatchResults,
--- a/capa/engine.py
+++ b/capa/engine.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -102,14 +102,14 @@ class And(Statement):
        super().__init__(description=description)
        self.children = children

-    def evaluate(self, ctx, short_circuit=True):
+    def evaluate(self, features: FeatureSet, short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.and"] += 1

        if short_circuit:
            results = []
            for child in self.children:
-                result = child.evaluate(ctx, short_circuit=short_circuit)
+                result = child.evaluate(features, short_circuit=short_circuit)
                results.append(result)
                if not result:
                    # short circuit
@@ -117,7 +117,7 @@ class And(Statement):

            return Result(True, self, results)
        else:
-            results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
+            results = [child.evaluate(features, short_circuit=short_circuit) for child in self.children]
            success = all(results)
            return Result(success, self, results)

@@ -135,14 +135,14 @@ class Or(Statement):
        super().__init__(description=description)
        self.children = children

-    def evaluate(self, ctx, short_circuit=True):
+    def evaluate(self, features: FeatureSet, short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.or"] += 1

        if short_circuit:
            results = []
            for child in self.children:
-                result = child.evaluate(ctx, short_circuit=short_circuit)
+                result = child.evaluate(features, short_circuit=short_circuit)
                results.append(result)
                if result:
                    # short circuit as soon as we hit one match
@@ -150,7 +150,7 @@ class Or(Statement):

            return Result(False, self, results)
        else:
-            results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
+            results = [child.evaluate(features, short_circuit=short_circuit) for child in self.children]
            success = any(results)
            return Result(success, self, results)

@@ -162,11 +162,11 @@ class Not(Statement):
        super().__init__(description=description)
        self.child = child

-    def evaluate(self, ctx, short_circuit=True):
+    def evaluate(self, features: FeatureSet, short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.not"] += 1

-        results = [self.child.evaluate(ctx, short_circuit=short_circuit)]
+        results = [self.child.evaluate(features, short_circuit=short_circuit)]
        success = not results[0]
        return Result(success, self, results)

@@ -185,7 +185,7 @@ class Some(Statement):
        self.count = count
        self.children = children

-    def evaluate(self, ctx, short_circuit=True):
+    def evaluate(self, features: FeatureSet, short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.some"] += 1

@@ -193,7 +193,7 @@ class Some(Statement):
            results = []
            satisfied_children_count = 0
            for child in self.children:
-                result = child.evaluate(ctx, short_circuit=short_circuit)
+                result = child.evaluate(features, short_circuit=short_circuit)
                results.append(result)
                if result:
                    satisfied_children_count += 1
@@ -204,7 +204,7 @@ class Some(Statement):

            return Result(False, self, results)
        else:
-            results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
+            results = [child.evaluate(features, short_circuit=short_circuit) for child in self.children]
            # note that here we cast the child result as a bool
            # because we've overridden `__bool__` above.
            #
@@ -214,7 +214,7 @@ class Some(Statement):


 class Range(Statement):
-    """match if the child is contained in the ctx set with a count in the given range."""
+    """match if the child is contained in the feature set with a count in the given range."""

    def __init__(self, child, min=None, max=None, description=None):
        super().__init__(description=description)
@@ -222,15 +222,15 @@ class Range(Statement):
        self.min = min if min is not None else 0
        self.max = max if max is not None else (1 << 64 - 1)

-    def evaluate(self, ctx, **kwargs):
+    def evaluate(self, features: FeatureSet, short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.range"] += 1

-        count = len(ctx.get(self.child, []))
+        count = len(features.get(self.child, []))
        if self.min == 0 and count == 0:
            return Result(True, self, [])

-        return Result(self.min <= count <= self.max, self, [], locations=ctx.get(self.child))
+        return Result(self.min <= count <= self.max, self, [], locations=features.get(self.child))

    def __str__(self):
        if self.max == (1 << 64 - 1):
@@ -250,7 +250,7 @@ class Subscope(Statement):
        self.scope = scope
        self.child = child

-    def evaluate(self, ctx, **kwargs):
+    def evaluate(self, features: FeatureSet, short_circuit=True):
        raise ValueError("cannot evaluate a subscope directly!")


@@ -270,6 +270,14 @@ class Subscope(Statement):
 MatchResults = Mapping[str, List[Tuple[Address, Result]]]


+def get_rule_namespaces(rule: "capa.rules.Rule") -> Iterator[str]:
+    namespace = rule.meta.get("namespace")
+    if namespace:
+        while namespace:
+            yield namespace
+            namespace, _, _ = namespace.rpartition("/")
+
+
 def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: Iterable[Address]):
    """
    record into the given featureset that the given rule matched at the given locations.
@@ -280,11 +288,8 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:
    updates `features` in-place. doesn't modify the remaining arguments.
    """
    features[capa.features.common.MatchedRule(rule.name)].update(locations)
-    namespace = rule.meta.get("namespace")
-    if namespace:
-        while namespace:
-            features[capa.features.common.MatchedRule(namespace)].update(locations)
-            namespace, _, _ = namespace.rpartition("/")
+    for namespace in get_rule_namespaces(rule):
+        features[capa.features.common.MatchedRule(namespace)].update(locations)


 def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -> Tuple[FeatureSet, MatchResults]:
--- a/capa/exceptions.py
+++ b/capa/exceptions.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/address.py
+++ b/capa/features/address.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -93,7 +93,7 @@ class ThreadAddress(Address):


 class DynamicCallAddress(Address):
-    """addesses a call in a dynamic execution trace"""
+    """addresses a call in a dynamic execution trace"""

    def __init__(self, thread: ThreadAddress, id: int):
        assert id >= 0
--- a/capa/features/basicblock.py
+++ b/capa/features/basicblock.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/com/init.py
+++ b/capa/features/com/init.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/com/classes.py
+++ b/capa/features/com/classes.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/com/interfaces.py
+++ b/capa/features/com/interfaces.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -128,7 +128,7 @@ class Feature(abc.ABC):  # noqa: B024

    def __lt__(self, other):
        # implementing sorting by serializing to JSON is a huge hack.
-        # its slow, inelegant, and probably doesn't work intuitively;
+        # it's slow, inelegant, and probably doesn't work intuitively;
        # however, we only use it for deterministic output, so it's good enough for now.

        # circular import
@@ -166,10 +166,10 @@ class Feature(abc.ABC):  # noqa: B024
    def __repr__(self):
        return str(self)

-    def evaluate(self, ctx: Dict["Feature", Set[Address]], **kwargs) -> Result:
+    def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True) -> Result:
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature." + self.name] += 1
-        return Result(self in ctx, self, [], locations=ctx.get(self, set()))
+        return Result(self in features, self, [], locations=features.get(self, set()))


 class MatchedRule(Feature):
@@ -207,7 +207,7 @@ class Substring(String):
        super().__init__(value, description=description)
        self.value = value

-    def evaluate(self, ctx, short_circuit=True):
+    def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.substring"] += 1

@@ -216,7 +216,7 @@ class Substring(String):
        matches: typing.DefaultDict[str, Set[Address]] = collections.defaultdict(set)

        assert isinstance(self.value, str)
-        for feature, locations in ctx.items():
+        for feature, locations in features.items():
            if not isinstance(feature, (String,)):
                continue

@@ -227,7 +227,7 @@ class Substring(String):
            if self.value in feature.value:
                matches[feature.value].update(locations)
                if short_circuit:
-                    # we found one matching string, thats sufficient to match.
+                    # we found one matching string, that's sufficient to match.
                    # don't collect other matching strings in this mode.
                    break

@@ -299,7 +299,7 @@ class Regex(String):
                f"invalid regular expression: {value} it should use Python syntax, try it at https://pythex.org"
            ) from exc

-    def evaluate(self, ctx, short_circuit=True):
+    def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.regex"] += 1

@@ -307,7 +307,7 @@ class Regex(String):
        # will unique the locations later on.
        matches: typing.DefaultDict[str, Set[Address]] = collections.defaultdict(set)

-        for feature, locations in ctx.items():
+        for feature, locations in features.items():
            if not isinstance(feature, (String,)):
                continue

@@ -322,7 +322,7 @@ class Regex(String):
            if self.re.search(feature.value):
                matches[feature.value].update(locations)
                if short_circuit:
-                    # we found one matching string, thats sufficient to match.
+                    # we found one matching string, that's sufficient to match.
                    # don't collect other matching strings in this mode.
                    break

@@ -384,12 +384,14 @@ class Bytes(Feature):
        super().__init__(value, description=description)
        self.value = value

-    def evaluate(self, ctx, **kwargs):
+    def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
+        assert isinstance(self.value, bytes)
+
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature.bytes"] += 1
+        capa.perf.counters["evaluate.feature.bytes." + str(len(self.value))] += 1

-        assert isinstance(self.value, bytes)
-        for feature, locations in ctx.items():
+        for feature, locations in features.items():
            if not isinstance(feature, (Bytes,)):
                continue

@@ -407,9 +409,10 @@ class Bytes(Feature):
 # other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types
 ARCH_I386 = "i386"
 ARCH_AMD64 = "amd64"
+ARCH_AARCH64 = "aarch64"
 # dotnet
 ARCH_ANY = "any"
-VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY)
+VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_AARCH64, ARCH_ANY)


 class Arch(Feature):
@@ -434,11 +437,11 @@ class OS(Feature):
        super().__init__(value, description=description)
        self.name = "os"

-    def evaluate(self, ctx, **kwargs):
+    def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
        capa.perf.counters["evaluate.feature"] += 1
        capa.perf.counters["evaluate.feature." + self.name] += 1

-        for feature, locations in ctx.items():
+        for feature, locations in features.items():
            if not isinstance(feature, (OS,)):
                continue

@@ -486,6 +489,6 @@ class Format(Feature):
 def is_global_feature(feature):
    """
    is this a feature that is extracted at every scope?
-    today, these are OS and arch features.
+    today, these are OS, arch, and format features.
    """
-    return isinstance(feature, (OS, Arch))
+    return isinstance(feature, (OS, Arch, Format))
--- a/capa/features/extractors/base_extractor.py
+++ b/capa/features/extractors/base_extractor.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -75,7 +75,7 @@ class BBHandle:

@dataclass
 class InsnHandle:
-    """reference to a instruction recognized by a feature extractor.
+    """reference to an instruction recognized by a feature extractor.

    Attributes:
        address: the address of the instruction address.
--- a/capa/features/extractors/binja/basicblock.py
+++ b/capa/features/extractors/binja/basicblock.py
@@ -7,17 +7,15 @@
 # See the License for the specific language governing permissions and limitations under the License.

 import string
-import struct
 from typing import Tuple, Iterator

-from binaryninja import Function, Settings
+from binaryninja import Function
 from binaryninja import BasicBlock as BinjaBasicBlock
 from binaryninja import (
    BinaryView,
    SymbolType,
    RegisterValueType,
    VariableSourceType,
-    MediumLevelILSetVar,
    MediumLevelILOperation,
    MediumLevelILBasicBlock,
    MediumLevelILInstruction,
@@ -29,11 +27,6 @@ from capa.features.basicblock import BasicBlock
 from capa.features.extractors.helpers import MIN_STACKSTRING_LEN
 from capa.features.extractors.base_extractor import BBHandle, FunctionHandle

-use_const_outline: bool = False
-settings: Settings = Settings()
-if settings.contains("analysis.outlining.builtins") and settings.get_bool("analysis.outlining.builtins"):
-    use_const_outline = True
-

 def get_printable_len_ascii(s: bytes) -> int:
    """Return string length if all operand bytes are ascii or utf16-le printable"""
@@ -65,7 +58,7 @@ def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int:

    addr = target.value.value
    sym = bv.get_symbol_at(addr)
-    if not sym or sym.type != SymbolType.LibraryFunctionSymbol:
+    if not sym or sym.type not in [SymbolType.LibraryFunctionSymbol, SymbolType.SymbolicFunctionSymbol]:
        return 0

    if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]:
@@ -91,52 +84,6 @@ def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int:
    return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s)))


-def get_printable_len(il: MediumLevelILSetVar) -> int:
-    """Return string length if all operand bytes are ascii or utf16-le printable"""
-    width = il.dest.type.width
-    value = il.src.value.value
-
-    if width == 1:
-        chars = struct.pack("<B", value & 0xFF)
-    elif width == 2:
-        chars = struct.pack("<H", value & 0xFFFF)
-    elif width == 4:
-        chars = struct.pack("<I", value & 0xFFFFFFFF)
-    elif width == 8:
-        chars = struct.pack("<Q", value & 0xFFFFFFFFFFFFFFFF)
-    else:
-        return 0
-
-    def is_printable_ascii(chars_: bytes):
-        return all(c < 127 and chr(c) in string.printable for c in chars_)
-
-    def is_printable_utf16le(chars_: bytes):
-        if all(c == 0x00 for c in chars_[1::2]):
-            return is_printable_ascii(chars_[::2])
-
-    if is_printable_ascii(chars):
-        return width
-
-    if is_printable_utf16le(chars):
-        return width // 2
-
-    return 0
-
-
-def is_mov_imm_to_stack(il: MediumLevelILInstruction) -> bool:
-    """verify instruction moves immediate onto stack"""
-    if il.operation != MediumLevelILOperation.MLIL_SET_VAR:
-        return False
-
-    if il.src.operation != MediumLevelILOperation.MLIL_CONST:
-        return False
-
-    if il.dest.source_type != VariableSourceType.StackVariableSourceType:
-        return False
-
-    return True
-
-
 def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool:
    """check basic block for stackstring indicators

@@ -144,14 +91,10 @@ def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool:
    """
    count = 0
    for il in bb:
-        if use_const_outline:
-            count += get_stack_string_len(f, il)
-        else:
-            if is_mov_imm_to_stack(il):
-                count += get_printable_len(il)
+        count += get_stack_string_len(f, il)
+        if count > MIN_STACKSTRING_LEN:
+            return True

-    if count > MIN_STACKSTRING_LEN:
-        return True
    return False


--- a/capa/features/extractors/binja/file.py
+++ b/capa/features/extractors/binja/file.py
@@ -74,13 +74,18 @@ def extract_file_embedded_pe(bv: BinaryView) -> Iterator[Tuple[Feature, Address]

 def extract_file_export_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
    """extract function exports"""
-    for sym in bv.get_symbols_of_type(SymbolType.FunctionSymbol):
+    for sym in bv.get_symbols_of_type(SymbolType.FunctionSymbol) + bv.get_symbols_of_type(SymbolType.DataSymbol):
        if sym.binding in [SymbolBinding.GlobalBinding, SymbolBinding.WeakBinding]:
            name = sym.short_name
-            yield Export(name), AbsoluteVirtualAddress(sym.address)
-            unmangled_name = unmangle_c_name(name)
-            if name != unmangled_name:
-                yield Export(unmangled_name), AbsoluteVirtualAddress(sym.address)
+            if name.startswith("__forwarder_name(") and name.endswith(")"):
+                yield Export(name[17:-1]), AbsoluteVirtualAddress(sym.address)
+                yield Characteristic("forwarded export"), AbsoluteVirtualAddress(sym.address)
+            else:
+                yield Export(name), AbsoluteVirtualAddress(sym.address)
+
+                unmangled_name = unmangle_c_name(name)
+                if name != unmangled_name:
+                    yield Export(unmangled_name), AbsoluteVirtualAddress(sym.address)

    for sym in bv.get_symbols_of_type(SymbolType.DataSymbol):
        if sym.binding not in [SymbolBinding.GlobalBinding]:
--- a/capa/features/extractors/binja/find_binja_api.py
+++ b/capa/features/extractors/binja/find_binja_api.py
@@ -11,7 +11,7 @@ from pathlib import Path
 # When the script gets executed as a standalone executable (via PyInstaller), `import binaryninja` does not work because
 # we have excluded the binaryninja module in `pyinstaller.spec`. The trick here is to call the system Python and try
 # to find out the path of the binaryninja module that has been installed.
-# Note, including the binaryninja module in the `pyintaller.spec` would not work, since the binaryninja module tries to
+# Note, including the binaryninja module in the `pyinstaller.spec` would not work, since the binaryninja module tries to
 # find the binaryninja core e.g., `libbinaryninjacore.dylib`, using a relative path. And this does not work when the
 # binaryninja module is extracted by the PyInstaller.
 code = r"""
--- a/capa/features/extractors/cape/models.py
+++ b/capa/features/extractors/cape/models.py
@@ -46,7 +46,7 @@ class FlexibleModel(BaseModel):


 # use this type to indicate that we won't model this data.
-# because its not relevant to our use in capa.
+# because it's not relevant to our use in capa.
 #
 # while its nice to have full coverage of the data shape,
 # it can easily change and break our parsing.
@@ -230,7 +230,7 @@ class File(FlexibleModel):
    sha1: str
    sha256: str
    sha512: str
-    sha3_384: str
+    sha3_384: Optional[str] = None
    ssdeep: str
    # unsure why this would ever be "False"
    tlsh: Optional[Union[str, bool]] = None
@@ -356,8 +356,8 @@ class Behavior(ExactModel):
    anomaly: List[str]
    encryptedbuffers: List[EncryptedBuffer]
    # these are small objects that describe atomic events,
-    # like file move, registery access.
-    # we'll detect the same with our API call analyis.
+    # like file move, registry access.
+    # we'll detect the same with our API call analysis.
    enhanced: Skip = None


@@ -398,7 +398,7 @@ class CapeReport(FlexibleModel):
    behavior: Behavior

    # post-processed results: payloads and extracted configs
-    CAPE: Optional[Cape] = None
+    CAPE: Optional[Union[Cape, List]] = None
    dropped: Optional[List[File]] = None
    procdump: Optional[List[ProcessFile]] = None
    procmemory: ListTODO
--- a/capa/features/extractors/common.py
+++ b/capa/features/extractors/common.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/dnfile/extractor.py
+++ b/capa/features/extractors/dnfile/extractor.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/dnfile/file.py
+++ b/capa/features/extractors/dnfile/file.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/dnfile/function.py
+++ b/capa/features/extractors/dnfile/function.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/dnfile/helpers.py
+++ b/capa/features/extractors/dnfile/helpers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -83,7 +83,7 @@ def read_dotnet_user_string(pe: dnfile.dnPE, token: StringToken) -> Optional[str
        return None

    try:
-        user_string: Optional[dnfile.stream.UserString] = pe.net.user_strings.get_us(token.rid)
+        user_string: Optional[dnfile.stream.UserString] = pe.net.user_strings.get(token.rid)
    except UnicodeDecodeError as e:
        logger.debug("failed to decode #US stream index 0x%08x (%s)", token.rid, e)
        return None
@@ -119,14 +119,14 @@ def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[DnType]:
        access: Optional[str]

        # assume .NET imports starting with get_/set_ are used to access a property
-        if member_ref.Name.startswith("get_"):
+        member_ref_name: str = str(member_ref.Name)
+        if member_ref_name.startswith("get_"):
            access = FeatureAccess.READ
-        elif member_ref.Name.startswith("set_"):
+        elif member_ref_name.startswith("set_"):
            access = FeatureAccess.WRITE
        else:
            access = None

-        member_ref_name: str = member_ref.Name
        if member_ref_name.startswith(("get_", "set_")):
            # remove get_/set_ from MemberRef name
            member_ref_name = member_ref_name[4:]
@@ -212,7 +212,7 @@ def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnType]:
            token: int = calculate_dotnet_token_value(method.table.number, method.row_index)
            access: Optional[str] = accessor_map.get(token)

-            method_name: str = method.row.Name
+            method_name: str = str(method.row.Name)
            if method_name.startswith(("get_", "set_")):
                # remove get_/set_
                method_name = method_name[4:]
@@ -289,8 +289,8 @@ def get_dotnet_unmanaged_imports(pe: dnfile.dnPE) -> Iterator[DnUnmanagedMethod]
            logger.debug("ImplMap[0x%X] ImportScope row is None", rid)
            module = ""
        else:
-            module = impl_map.ImportScope.row.Name
-        method: str = impl_map.ImportName
+            module = str(impl_map.ImportScope.row.Name)
+        method: str = str(impl_map.ImportName)

        member_forward_table: int
        if impl_map.MemberForwarded.table is None:
@@ -320,8 +320,11 @@ def get_dotnet_table_row(pe: dnfile.dnPE, table_index: int, row_index: int) -> O
    if row_index - 1 <= 0:
        return None

+    table: Optional[dnfile.base.ClrMetaDataTable] = pe.net.mdtables.tables.get(table_index)
+    if table is None:
+        return None
+
    try:
-        table = pe.net.mdtables.tables.get(table_index, [])
        return table[row_index - 1]
    except IndexError:
        return None
@@ -334,7 +337,7 @@ def resolve_nested_typedef_name(

    if index in nested_class_table:
        typedef_name = []
-        name = typedef.TypeName
+        name = str(typedef.TypeName)

        # Append the current typedef name
        typedef_name.append(name)
@@ -343,24 +346,24 @@ def resolve_nested_typedef_name(
            # Iterate through the typedef table to resolve the nested name
            table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeDef.number, nested_class_table[index])
            if table_row is None:
-                return typedef.TypeNamespace, tuple(typedef_name[::-1])
+                return str(typedef.TypeNamespace), tuple(typedef_name[::-1])

-            name = table_row.TypeName
+            name = str(table_row.TypeName)
            typedef_name.append(name)
            index = nested_class_table[index]

        # Document the root enclosing details
        table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeDef.number, nested_class_table[index])
        if table_row is None:
-            return typedef.TypeNamespace, tuple(typedef_name[::-1])
+            return str(typedef.TypeNamespace), tuple(typedef_name[::-1])

-        enclosing_name = table_row.TypeName
+        enclosing_name = str(table_row.TypeName)
        typedef_name.append(enclosing_name)

-        return table_row.TypeNamespace, tuple(typedef_name[::-1])
+        return str(table_row.TypeNamespace), tuple(typedef_name[::-1])

    else:
-        return typedef.TypeNamespace, (typedef.TypeName,)
+        return str(typedef.TypeNamespace), (str(typedef.TypeName),)


 def resolve_nested_typeref_name(
@@ -370,29 +373,29 @@ def resolve_nested_typeref_name(
    # If the ResolutionScope decodes to a typeRef type then it is nested
    if isinstance(typeref.ResolutionScope.table, dnfile.mdtable.TypeRef):
        typeref_name = []
-        name = typeref.TypeName
+        name = str(typeref.TypeName)
        # Not appending the current typeref name to avoid potential duplicate

        # Validate index
        table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeRef.number, index)
        if table_row is None:
-            return typeref.TypeNamespace, (typeref.TypeName,)
+            return str(typeref.TypeNamespace), (str(typeref.TypeName),)

        while isinstance(table_row.ResolutionScope.table, dnfile.mdtable.TypeRef):
            # Iterate through the typeref table to resolve the nested name
            typeref_name.append(name)
-            name = table_row.TypeName
+            name = str(table_row.TypeName)
            table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeRef.number, table_row.ResolutionScope.row_index)
            if table_row is None:
-                return typeref.TypeNamespace, tuple(typeref_name[::-1])
+                return str(typeref.TypeNamespace), tuple(typeref_name[::-1])

        # Document the root enclosing details
-        typeref_name.append(table_row.TypeName)
+        typeref_name.append(str(table_row.TypeName))

-        return table_row.TypeNamespace, tuple(typeref_name[::-1])
+        return str(table_row.TypeNamespace), tuple(typeref_name[::-1])

    else:
-        return typeref.TypeNamespace, (typeref.TypeName,)
+        return str(typeref.TypeNamespace), (str(typeref.TypeName),)


 def get_dotnet_nested_class_table_index(pe: dnfile.dnPE) -> Dict[int, int]:
--- a/capa/features/extractors/dnfile/insn.py
+++ b/capa/features/extractors/dnfile/insn.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/dnfile/types.py
+++ b/capa/features/extractors/dnfile/types.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/dotnetfile.py
+++ b/capa/features/extractors/dotnetfile.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -49,8 +49,8 @@ logger = logging.getLogger(__name__)


 def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]:
-    yield Format(FORMAT_PE), NO_ADDRESS
    yield Format(FORMAT_DOTNET), NO_ADDRESS
+    yield Format(FORMAT_PE), NO_ADDRESS


 def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Import, Address]]:
@@ -78,12 +78,12 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple
    for _, typedef in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number):
        # emit internal .NET namespaces
        assert isinstance(typedef, dnfile.mdtable.TypeDefRow)
-        namespaces.add(typedef.TypeNamespace)
+        namespaces.add(str(typedef.TypeNamespace))

    for _, typeref in iter_dotnet_table(pe, dnfile.mdtable.TypeRef.number):
        # emit external .NET namespaces
        assert isinstance(typeref, dnfile.mdtable.TypeRefRow)
-        namespaces.add(typeref.TypeNamespace)
+        namespaces.add(str(typeref.TypeNamespace))

    # namespaces may be empty, discard
    namespaces.discard("")
--- a/capa/features/extractors/elf.py
+++ b/capa/features/extractors/elf.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -10,10 +10,11 @@ import logging
 import itertools
 import collections
 from enum import Enum
-from typing import Set, Dict, List, Tuple, BinaryIO, Iterator, Optional
+from typing import TYPE_CHECKING, Set, Dict, List, Tuple, BinaryIO, Iterator, Optional
 from dataclasses import dataclass

-import Elf  # from vivisect
+if TYPE_CHECKING:
+    import Elf  # from vivisect

 logger = logging.getLogger(__name__)

@@ -57,6 +58,10 @@ class OS(str, Enum):
    SYLLABLE = "syllable"
    NACL = "nacl"
    ANDROID = "android"
+    DRAGONFLYBSD = "dragonfly BSD"
+    ILLUMOS = "illumos"
+    ZOS = "z/os"
+    UNIX = "unix"


 # via readelf: https://github.com/bminor/binutils-gdb/blob/c0e94211e1ac05049a4ce7c192c9d14d1764eb3e/binutils/readelf.c#L19635-L19658
@@ -80,6 +85,8 @@ class Phdr:
    paddr: int
    filesz: int
    buf: bytes
+    flags: int
+    memsz: int


@dataclass
@@ -205,7 +212,7 @@ class ELF:
        15: OS.AROS,
        16: OS.FENIXOS,
        17: OS.CLOUD,
-        # 53: "SORTFIX",      # i can't find any reference to this OS, i dont think it exists
+        # 53: "SORTFIX",      # i can't find any reference to this OS, i don't think it exists
        # 64: "ARM_AEABI",    # not an OS
        # 97: "ARM",          # not an OS
        # 255: "STANDALONE",  # not an OS
@@ -314,24 +321,23 @@ class ELF:
        phent_offset = i * self.e_phentsize
        phent = self.phbuf[phent_offset : phent_offset + self.e_phentsize]

-        (p_type,) = struct.unpack_from(self.endian + "I", phent, 0x0)
-        logger.debug("ph:p_type: 0x%04x", p_type)
-
        if self.bitness == 32:
-            p_offset, p_vaddr, p_paddr, p_filesz = struct.unpack_from(self.endian + "IIII", phent, 0x4)
+            p_type, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz, p_flags = struct.unpack_from(
+                self.endian + "IIIIIII", phent, 0x0
+            )
        elif self.bitness == 64:
-            p_offset, p_vaddr, p_paddr, p_filesz = struct.unpack_from(self.endian + "QQQQ", phent, 0x8)
+            p_type, p_flags, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz = struct.unpack_from(
+                self.endian + "IIQQQQQ", phent, 0x0
+            )
        else:
            raise NotImplementedError()

-        logger.debug("ph:p_offset: 0x%02x p_filesz: 0x%04x", p_offset, p_filesz)
-
        self.f.seek(p_offset)
        buf = self.f.read(p_filesz)
        if len(buf) != p_filesz:
            raise ValueError("failed to read program header content")

-        return Phdr(p_type, p_offset, p_vaddr, p_paddr, p_filesz, buf)
+        return Phdr(p_type, p_offset, p_vaddr, p_paddr, p_filesz, buf, p_flags, p_memsz)

    @property
    def program_headers(self):
@@ -356,8 +362,6 @@ class ELF:
        else:
            raise NotImplementedError()

-        logger.debug("sh:sh_offset: 0x%02x sh_size: 0x%04x", sh_offset, sh_size)
-
        self.f.seek(sh_offset)
        buf = self.f.read(sh_size)
        if len(buf) != sh_size:
@@ -724,7 +728,7 @@ class SymTab:
        yield from self.symbols

    @classmethod
-    def from_viv(cls, elf: Elf.Elf) -> Optional["SymTab"]:
+    def from_viv(cls, elf: "Elf.Elf") -> Optional["SymTab"]:
        endian = "<" if elf.getEndian() == 0 else ">"
        bitness = elf.bits

@@ -866,6 +870,8 @@ def guess_os_from_ident_directive(elf: ELF) -> Optional[OS]:
            return OS.LINUX
        elif "Red Hat" in comment:
            return OS.LINUX
+        elif "Alpine" in comment:
+            return OS.LINUX
        elif "Android" in comment:
            return OS.ANDROID

@@ -951,11 +957,506 @@ def guess_os_from_symtab(elf: ELF) -> Optional[OS]:

        for os, hints in keywords.items():
            if any(hint in sym_name for hint in hints):
+                logger.debug("symtab: %s looks like %s", sym_name, os)
                return os

    return None


+def is_go_binary(elf: ELF) -> bool:
+    for shdr in elf.section_headers:
+        if shdr.get_name(elf) == ".note.go.buildid":
+            logger.debug("go buildinfo: found section .note.go.buildid")
+            return True
+
+    # The `go version` command enumerates sections for the name `.go.buildinfo`
+    # (in addition to looking for the BUILDINFO_MAGIC) to check if an executable is go or not.
+    # See references to the `errNotGoExe` error here:
+    # https://github.com/golang/go/blob/master/src/debug/buildinfo/buildinfo.go#L41
+    for shdr in elf.section_headers:
+        if shdr.get_name(elf) == ".go.buildinfo":
+            logger.debug("go buildinfo: found section .go.buildinfo")
+            return True
+
+    # other strategy used by FLOSS: search for known runtime strings.
+    # https://github.com/mandiant/flare-floss/blob/b2ca8adfc5edf278861dd6bff67d73da39683b46/floss/language/identify.py#L88
+    return False
+
+
+def get_go_buildinfo_data(elf: ELF) -> Optional[bytes]:
+    for shdr in elf.section_headers:
+        if shdr.get_name(elf) == ".go.buildinfo":
+            logger.debug("go buildinfo: found section .go.buildinfo")
+            return shdr.buf
+
+    PT_LOAD = 0x1
+    PF_X = 1
+    PF_W = 2
+    for phdr in elf.program_headers:
+        if phdr.type != PT_LOAD:
+            continue
+
+        if (phdr.flags & (PF_X | PF_W)) == PF_W:
+            logger.debug("go buildinfo: found data segment")
+            return phdr.buf
+
+    return None
+
+
+def read_data(elf: ELF, rva: int, size: int) -> Optional[bytes]:
+    # ELF segments are for runtime data,
+    # ELF sections are for link-time data.
+    # So we want to read Program Headers/Segments.
+    for phdr in elf.program_headers:
+        if phdr.vaddr <= rva < phdr.vaddr + phdr.memsz:
+            segment_data = phdr.buf
+
+            # pad the segment with NULLs
+            # assume page alignment is already handled.
+            # might need more hardening here.
+            if len(segment_data) < phdr.memsz:
+                segment_data += b"\x00" * (phdr.memsz - len(segment_data))
+
+            segment_offset = rva - phdr.vaddr
+            return segment_data[segment_offset : segment_offset + size]
+
+    return None
+
+
+def read_go_slice(elf: ELF, rva: int) -> Optional[bytes]:
+    if elf.bitness == 32:
+        struct_size = 8  # slice header: 32-bit data pointer + 32-bit length
+        struct_format = elf.endian + "II"
+    elif elf.bitness == 64:
+        struct_size = 16  # slice header: 64-bit data pointer + 64-bit length
+        struct_format = elf.endian + "QQ"
+    else:
+        raise ValueError("invalid psize")
+
+    struct_buf = read_data(elf, rva, struct_size)
+    if struct_buf is None or len(struct_buf) < struct_size:
+        return None  # unmapped rva, or header truncated at the end of a segment
+
+    addr, length = struct.unpack_from(struct_format, struct_buf, 0)
+
+    return read_data(elf, addr, length)
+
+
+def guess_os_from_go_buildinfo(elf: ELF) -> Optional[OS]:
+    """
+    In a binary compiled by Go, the buildinfo structure may contain
+    metadata about the build environment, including the configured
+    GOOS, which specifies the target operating system.
+
+    Search for and parse the buildinfo structure,
+    which may be found in the .go.buildinfo section,
+    and often contains this metadata inline. Otherwise,
+    follow a few byte slices to the relevant information.
+
+    This strategy is derived from GoReSym.
+    """
+    buf = get_go_buildinfo_data(elf)
+    if not buf:
+        logger.debug("go buildinfo: no buildinfo section")
+        return None
+
+    assert isinstance(buf, bytes)
+
+    # The build info blob left by the linker is identified by
+    # a 16-byte header, consisting of:
+    #  - buildInfoMagic (14 bytes),
+    #  - the binary's pointer size (1 byte), and
+    #  - whether the binary is big endian (1 byte).
+    #
+    # Then:
+    #  - virtual address to Go string: runtime.buildVersion
+    #  - virtual address to Go string: runtime.modinfo
+    #
+    #  On 32-bit platforms, the last 8 bytes are unused.
+    #
+    #  If the endianness has the 2 bit set, then the pointers are zero,
+    #  and the 32-byte header is followed by varint-prefixed string data
+    #  for the two string values we care about.
+    # https://github.com/mandiant/GoReSym/blob/0860a1b1b4f3495e9fb7e71eb4386bf3e0a7c500/buildinfo/buildinfo.go#L185-L193
+    BUILDINFO_MAGIC = b"\xFF Go buildinf:"
+
+    try:
+        index = buf.index(BUILDINFO_MAGIC)
+    except ValueError:
+        logger.debug("go buildinfo: no buildinfo magic")
+        return None
+
+    psize, flags = struct.unpack_from("<bb", buf, index + len(BUILDINFO_MAGIC))
+    assert psize in (4, 8)
+    is_big_endian = flags & 0b01
+    has_inline_strings = flags & 0b10
+    logger.debug("go buildinfo: psize: %d big endian: %s inline: %s", psize, is_big_endian, has_inline_strings)
+
+    GOOS_TO_OS = {
+        b"aix": OS.AIX,
+        b"android": OS.ANDROID,
+        b"dragonfly": OS.DRAGONFLYBSD,
+        b"freebsd": OS.FREEBSD,
+        b"hurd": OS.HURD,
+        b"illumos": OS.ILLUMOS,
+        b"linux": OS.LINUX,
+        b"netbsd": OS.NETBSD,
+        b"openbsd": OS.OPENBSD,
+        b"solaris": OS.SOLARIS,
+        b"zos": OS.ZOS,
+        b"windows": None,  # PE format
+        b"plan9": None,  # a.out format
+        b"ios": None,  # Mach-O format
+        b"darwin": None,  # Mach-O format
+        b"nacl": None,  # dropped in GO 1.14
+        b"js": None,
+    }
+
+    if has_inline_strings:
+        # This is the common case/path. Most samples will have an inline GOOS string.
+        #
+        # To find samples on VT, use these VTGrep searches:
+        #
+        #   content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 02}
+        #   content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 02}
+
+        # If present, the GOOS key will be found within
+        # the current buildinfo data region.
+        #
+        # Brute force the k-v pair, like `GOOS=linux`,
+        # rather than try to parse the data, which would be fragile.
+        for key, os in GOOS_TO_OS.items():
+            if (b"GOOS=" + key) in buf:
+                logger.debug("go buildinfo: found os: %s", os)
+                return os
+    else:
+        # This is the uncommon path. Most samples will have an inline GOOS string.
+        #
+        # To find samples on VT, use the referenced VTGrep content searches.
+        info_format = {
+            # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 00}
+            # like: 71e617e5cc7fda89bf67422ff60f437e9d54622382c5ed6ff31f75e601f9b22e
+            # in which the modinfo doesn't have GOOS.
+            (4, False): "<II",
+            # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 00}
+            # like: 93d3b3e2a904c6c909e20f2f76c3c2e8d0c81d535eb46e5493b5701f461816c3
+            # in which the modinfo doesn't have GOOS.
+            (8, False): "<QQ",
+            # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 01}
+            # (no matches on VT today)
+            (4, True): ">II",
+            # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 01}
+            # like: d44ba497964050c0e3dd2a192c511e4c3c4f17717f0322a554d64b797ee4690a
+            # in which the modinfo doesn't have GOOS.
+            (8, True): ">QQ",
+        }
+
+        build_version_address, modinfo_address = struct.unpack_from(
+            info_format[(psize, is_big_endian)], buf, index + 0x10
+        )
+        logger.debug("go buildinfo: build version address: 0x%x", build_version_address)
+        logger.debug("go buildinfo: modinfo address: 0x%x", modinfo_address)
+
+        build_version = read_go_slice(elf, build_version_address)
+        if build_version:
+            logger.debug("go buildinfo: build version: %s", build_version.decode("utf-8", errors="replace"))
+
+        modinfo = read_go_slice(elf, modinfo_address)
+        if modinfo:
+            if len(modinfo) >= 33 and modinfo[-0x11] == ord("\n"):
+                # Strip module framing: sentinel strings delimiting the module info.
+                # These are cmd/go/internal/modload/build.infoStart and infoEnd.
+                # Which should probably be:
+                # 	infoStart, _ = hex.DecodeString("3077af0c9274080241e1c107e6d618e6")
+                #   infoEnd, _   = hex.DecodeString("f932433186182072008242104116d8f2")
+                modinfo = modinfo[0x10:-0x10]
+            logger.debug("go buildinfo: modinfo: %s", modinfo.decode("utf-8", errors="replace"))
+
+        if not modinfo:
+            return None
+
+        for key, os in GOOS_TO_OS.items():
+            # Brute force the k-v pair, like `GOOS=linux`,
+            # rather than try to parse the data, which would be fragile.
+            if (b"GOOS=" + key) in modinfo:
+                logger.debug("go buildinfo: found os: %s", os)
+                return os
+
+    return None
+
+
+def guess_os_from_go_source(elf: ELF) -> Optional[OS]:
+    """
+    In a binary compiled by Go, runtime metadata may contain
+    references to the source filenames, including the
+    src/runtime/os_* files, whose name indicates the
+    target operating system.
+
+    Confirm the given ELF seems to be built by Go,
+    and then look for strings that look like
+    Go source filenames.
+
+    This strategy is derived from GoReSym.
+    """
+    if not is_go_binary(elf):
+        return None
+
+    for phdr in elf.program_headers:
+        buf = phdr.buf
+        NEEDLE_OS = b"/src/runtime/os_"
+        try:
+            index = buf.index(NEEDLE_OS)
+        except ValueError:
+            continue
+
+        rest = buf[index + len(NEEDLE_OS) : index + len(NEEDLE_OS) + 32]
+        filename = rest.partition(b".go")[0].decode("utf-8")
+        logger.debug("go source: filename: /src/runtime/os_%s.go", filename)
+
+        # via: https://cs.opensource.google/go/go/+/master:src/runtime/;bpv=1;bpt=0
+        # candidates today:
+        #   - aix
+        #   - android
+        #   - darwin
+        #   - darwin_arm64
+        #   - dragonfly
+        #   - freebsd
+        #   - freebsd2
+        #   - freebsd_amd64
+        #   - freebsd_arm
+        #   - freebsd_arm64
+        #   - freebsd_noauxv
+        #   - freebsd_riscv64
+        #   - illumos
+        #   - js
+        #   - linux
+        #   - linux_arm
+        #   - linux_arm64
+        #   - linux_be64
+        #   - linux_generic
+        #   - linux_loong64
+        #   - linux_mips64x
+        #   - linux_mipsx
+        #   - linux_noauxv
+        #   - linux_novdso
+        #   - linux_ppc64x
+        #   - linux_riscv64
+        #   - linux_s390x
+        #   - linux_x86
+        #   - netbsd
+        #   - netbsd_386
+        #   - netbsd_amd64
+        #   - netbsd_arm
+        #   - netbsd_arm64
+        #   - nonopenbsd
+        #   - only_solaris
+        #   - openbsd
+        #   - openbsd_arm
+        #   - openbsd_arm64
+        #   - openbsd_libc
+        #   - openbsd_mips64
+        #   - openbsd_syscall
+        #   - openbsd_syscall1
+        #   - openbsd_syscall2
+        #   - plan9
+        #   - plan9_arm
+        #   - solaris
+        #   - unix
+        #   - unix_nonlinux
+        #   - wasip1
+        #   - wasm
+        #   - windows
+        #   - windows_arm
+        #   - windows_arm64
+
+        OS_FILENAME_TO_OS = {
+            "aix": OS.AIX,
+            "android": OS.ANDROID,
+            "dragonfly": OS.DRAGONFLYBSD,
+            "freebsd": OS.FREEBSD,
+            "freebsd2": OS.FREEBSD,
+            "freebsd_": OS.FREEBSD,
+            "illumos": OS.ILLUMOS,
+            "linux": OS.LINUX,
+            "netbsd": OS.NETBSD,
+            "only_solaris": OS.SOLARIS,
+            "openbsd": OS.OPENBSD,
+            "solaris": OS.SOLARIS,
+            "unix_nonlinux": OS.UNIX,
+        }
+
+        for prefix, os in OS_FILENAME_TO_OS.items():
+            if filename.startswith(prefix):
+                return os
+
+    for phdr in elf.program_headers:
+        buf = phdr.buf
+        NEEDLE_RT0 = b"/src/runtime/rt0_"
+        try:
+            index = buf.index(NEEDLE_RT0)
+        except ValueError:
+            continue
+
+        rest = buf[index + len(NEEDLE_RT0) : index + len(NEEDLE_RT0) + 32]
+        filename = rest.partition(b".s")[0].decode("utf-8")
+        logger.debug("go source: filename: /src/runtime/rt0_%s.s", filename)
+
+        # via: https://cs.opensource.google/go/go/+/master:src/runtime/;bpv=1;bpt=0
+        # candidates today:
+        #   - aix_ppc64
+        #   - android_386
+        #   - android_amd64
+        #   - android_arm
+        #   - android_arm64
+        #   - darwin_amd64
+        #   - darwin_arm64
+        #   - dragonfly_amd64
+        #   - freebsd_386
+        #   - freebsd_amd64
+        #   - freebsd_arm
+        #   - freebsd_arm64
+        #   - freebsd_riscv64
+        #   - illumos_amd64
+        #   - ios_amd64
+        #   - ios_arm64
+        #   - js_wasm
+        #   - linux_386
+        #   - linux_amd64
+        #   - linux_arm
+        #   - linux_arm64
+        #   - linux_loong64
+        #   - linux_mips64x
+        #   - linux_mipsx
+        #   - linux_ppc64
+        #   - linux_ppc64le
+        #   - linux_riscv64
+        #   - linux_s390x
+        #   - netbsd_386
+        #   - netbsd_amd64
+        #   - netbsd_arm
+        #   - netbsd_arm64
+        #   - openbsd_386
+        #   - openbsd_amd64
+        #   - openbsd_arm
+        #   - openbsd_arm64
+        #   - openbsd_mips64
+        #   - openbsd_ppc64
+        #   - openbsd_riscv64
+        #   - plan9_386
+        #   - plan9_amd64
+        #   - plan9_arm
+        #   - solaris_amd64
+        #   - wasip1_wasm
+        #   - windows_386
+        #   - windows_amd64
+        #   - windows_arm
+        #   - windows_arm64
+
+        RT0_FILENAME_TO_OS = {
+            "aix": OS.AIX,
+            "android": OS.ANDROID,
+            "dragonfly": OS.DRAGONFLYBSD,
+            "freebsd": OS.FREEBSD,
+            "illumos": OS.ILLUMOS,
+            "linux": OS.LINUX,
+            "netbsd": OS.NETBSD,
+            "openbsd": OS.OPENBSD,
+            "solaris": OS.SOLARIS,
+        }
+
+        for prefix, os in RT0_FILENAME_TO_OS.items():
+            if filename.startswith(prefix):
+                return os
+
+    return None
+
+
+def guess_os_from_vdso_strings(elf: ELF) -> Optional[OS]:
+    """
+    The "vDSO" (virtual dynamic shared object) is a small shared
+    library that the kernel automatically maps into the address space
+    of all user-space applications.
+
+    Some statically linked executables include small dynamic linker
+    routines that find these vDSO symbols, using the ASCII
+    symbol name and version. We can therefore recognize the pairs
+    (symbol, version) to guess the binary targets Linux.
+    """
+    for phdr in elf.program_headers:
+        buf = phdr.buf
+
+        # We don't really use the arch, but it's interesting for documentation
+        # I suppose we could restrict the arch here to what's in the ELF header,
+        # but that's even more work. Let's see if this is sufficient.
+        for arch, symbol, version in (
+            # via: https://man7.org/linux/man-pages/man7/vdso.7.html
+            ("arm", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("arm", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("aarch64", b"__kernel_rt_sigreturn", b"LINUX_2.6.39"),
+            ("aarch64", b"__kernel_gettimeofday", b"LINUX_2.6.39"),
+            ("aarch64", b"__kernel_clock_gettime", b"LINUX_2.6.39"),
+            ("aarch64", b"__kernel_clock_getres", b"LINUX_2.6.39"),
+            ("mips", b"__kernel_gettimeofday", b"LINUX_2.6"),
+            ("mips", b"__kernel_clock_gettime", b"LINUX_2.6"),
+            ("ia64", b"__kernel_sigtramp", b"LINUX_2.5"),
+            ("ia64", b"__kernel_syscall_via_break", b"LINUX_2.5"),
+            ("ia64", b"__kernel_syscall_via_epc", b"LINUX_2.5"),
+            ("ppc/32", b"__kernel_clock_getres", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_clock_gettime", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_clock_gettime64", b"LINUX_5.11"),
+            ("ppc/32", b"__kernel_datapage_offset", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_get_syscall_map", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_get_tbfreq", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_getcpu", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_gettimeofday", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sigtramp_rt32", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sigtramp32", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sync_dicache", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sync_dicache_p5", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_clock_getres", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_clock_gettime", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_datapage_offset", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_get_syscall_map", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_get_tbfreq", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_getcpu", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_gettimeofday", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_sigtramp_rt64", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_sync_dicache", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_sync_dicache_p5", b"LINUX_2.6.15"),
+            ("riscv", b"__vdso_rt_sigreturn", b"LINUX_4.15"),
+            ("riscv", b"__vdso_gettimeofday", b"LINUX_4.15"),
+            ("riscv", b"__vdso_clock_gettime", b"LINUX_4.15"),
+            ("riscv", b"__vdso_clock_getres", b"LINUX_4.15"),
+            ("riscv", b"__vdso_getcpu", b"LINUX_4.15"),
+            ("riscv", b"__vdso_flush_icache", b"LINUX_4.15"),
+            ("s390", b"__kernel_clock_getres", b"LINUX_2.6.29"),
+            ("s390", b"__kernel_clock_gettime", b"LINUX_2.6.29"),
+            ("s390", b"__kernel_gettimeofday", b"LINUX_2.6.29"),
+            ("superh", b"__kernel_rt_sigreturn", b"LINUX_2.6"),
+            ("superh", b"__kernel_sigreturn", b"LINUX_2.6"),
+            ("superh", b"__kernel_vsyscall", b"LINUX_2.6"),
+            ("i386", b"__kernel_sigreturn", b"LINUX_2.5"),
+            ("i386", b"__kernel_rt_sigreturn", b"LINUX_2.5"),
+            ("i386", b"__kernel_vsyscall", b"LINUX_2.5"),
+            ("i386", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("i386", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("i386", b"__vdso_time", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_getcpu", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_time", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_getcpu", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_time", b"LINUX_2.6"),
+        ):
+            if symbol in buf and version in buf:
+                logger.debug("vdso string: %s %s %s", arch, symbol.decode("ascii"), version.decode("ascii"))
+                return OS.LINUX
+
+    return None
+
+
 def detect_elf_os(f) -> str:
    """
    f: type Union[BinaryIO, IDAIO, GHIDRAIO]
@@ -1022,6 +1523,27 @@ def detect_elf_os(f) -> str:
        logger.warning("Error guessing OS from symbol table: %s", e)
        symtab_guess = None

+    try:
+        goos_guess = guess_os_from_go_buildinfo(elf)
+        logger.debug("guess: Go buildinfo: %s", goos_guess)
+    except Exception as e:
+        logger.warning("Error guessing OS from Go buildinfo: %s", e)
+        goos_guess = None
+
+    try:
+        gosrc_guess = guess_os_from_go_source(elf)
+        logger.debug("guess: Go source: %s", gosrc_guess)
+    except Exception as e:
+        logger.warning("Error guessing OS from Go source path: %s", e)
+        gosrc_guess = None
+
+    try:
+        vdso_guess = guess_os_from_vdso_strings(elf)
+        logger.debug("guess: vdso strings: %s", vdso_guess)
+    except Exception as e:
+        logger.warning("Error guessing OS from vdso strings: %s", e)
+        vdso_guess = None  # reset this strategy's own guess; must not clobber symtab_guess
+
    ret = None

    if osabi_guess:
@@ -1045,11 +1567,24 @@ def detect_elf_os(f) -> str:
    elif symtab_guess:
        ret = symtab_guess

+    elif goos_guess:
+        ret = goos_guess
+
+    elif gosrc_guess:
+        # prefer goos_guess to this method,
+        # which is just string interpretation.
+        ret = gosrc_guess
+
    elif ident_guess:
        # at the bottom because we don't trust this too much
        # due to potential for bugs with cross-compilation.
        ret = ident_guess

+    elif vdso_guess:
+        # at the bottom because this is just scanning strings,
+        # which isn't very authoritative.
+        ret = vdso_guess
+
    return ret.value if ret is not None else "unknown"


--- a/capa/features/extractors/elffile.py
+++ b/capa/features/extractors/elffile.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/ghidra/extractor.py
+++ b/capa/features/extractors/ghidra/extractor.py
@@ -34,7 +34,7 @@ class GhidraFeatureExtractor(StaticFeatureExtractor):
                # https://ghidra.re/ghidra_docs/api/ghidra/program/model/listing/Program.html
                #
                # the hashes are stored in the database, not computed on the fly,
-                # so its probably not trivial to add SHA1.
+                # so it's probably not trivial to add SHA1.
                sha1="",
                sha256=capa.ghidra.helpers.get_file_sha256(),
            )
--- a/capa/features/extractors/ghidra/helpers.py
+++ b/capa/features/extractors/ghidra/helpers.py
@@ -260,7 +260,7 @@ def dereference_ptr(insn: ghidra.program.database.code.InstructionDB):
        if thfunc and thfunc.isThunk():
            return handle_thunk(to_deref)
        else:
-            # if it doesn't poin to a thunk, it's usually a jmp to a label
+            # if it doesn't point to a thunk, it's usually a jmp to a label
            return to_deref
    if not dat:
        return to_deref
--- a/capa/features/extractors/helpers.py
+++ b/capa/features/extractors/helpers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/ida/basicblock.py
+++ b/capa/features/extractors/ida/basicblock.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/ida/extractor.py
+++ b/capa/features/extractors/ida/extractor.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/ida/file.py
+++ b/capa/features/extractors/ida/file.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/ida/function.py
+++ b/capa/features/extractors/ida/function.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/ida/global_.py
+++ b/capa/features/extractors/ida/global_.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/ida/helpers.py
+++ b/capa/features/extractors/ida/helpers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -10,6 +10,7 @@ from typing import Any, Dict, Tuple, Iterator, Optional

 import idc
 import idaapi
+import ida_nalt
 import idautils
 import ida_bytes
 import ida_segment
@@ -17,6 +18,8 @@ import ida_segment
 from capa.features.address import AbsoluteVirtualAddress
 from capa.features.extractors.base_extractor import FunctionHandle

+IDA_NALT_ENCODING = ida_nalt.get_default_encoding_idx(ida_nalt.BPU_1B)  # use one byte-per-character encoding
+

 def find_byte_sequence(start: int, end: int, seq: bytes) -> Iterator[int]:
    """yield all ea of a given byte sequence
@@ -26,11 +29,16 @@ def find_byte_sequence(start: int, end: int, seq: bytes) -> Iterator[int]:
        end: max virtual address
        seq: bytes to search e.g. b"\x01\x03"
    """
+    patterns = ida_bytes.compiled_binpat_vec_t()
+
    seqstr = " ".join([f"{b:02x}" for b in seq])
+    err = ida_bytes.parse_binpat_str(patterns, 0, seqstr, 16, IDA_NALT_ENCODING)
+
+    if err:
+        return
+
    while True:
-        # TODO(mike-hunhoff): find_binary is deprecated. Please use ida_bytes.bin_search() instead.
-        # https://github.com/mandiant/capa/issues/1606
-        ea = idaapi.find_binary(start, end, seqstr, 0, idaapi.SEARCH_DOWN)
+        ea = ida_bytes.bin_search(start, end, patterns, ida_bytes.BIN_SEARCH_FORWARD)
        if ea == idaapi.BADADDR:
            break
        start = ea + 1
--- a/capa/features/extractors/ida/insn.py
+++ b/capa/features/extractors/ida/insn.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/loops.py
+++ b/capa/features/extractors/loops.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/null.py
+++ b/capa/features/extractors/null.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/pefile.py
+++ b/capa/features/extractors/pefile.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/strings.py
+++ b/capa/features/extractors/strings.py
@@ -1,6 +1,6 @@
 # strings code from FLOSS, https://github.com/mandiant/flare-floss
 #
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/basicblock.py
+++ b/capa/features/extractors/viv/basicblock.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/extractor.py
+++ b/capa/features/extractors/viv/extractor.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/file.py
+++ b/capa/features/extractors/viv/file.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/function.py
+++ b/capa/features/extractors/viv/function.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/global_.py
+++ b/capa/features/extractors/viv/global_.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/helpers.py
+++ b/capa/features/extractors/viv/helpers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/indirect_calls.py
+++ b/capa/features/extractors/viv/indirect_calls.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/extractors/viv/insn.py
+++ b/capa/features/extractors/viv/insn.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -113,7 +113,7 @@ def extract_insn_api_features(fh: FunctionHandle, bb, ih: InsnHandle) -> Iterato
        if f.vw.metadata["Format"] == "elf":
            if "symtab" not in fh.ctx["cache"]:
                # the symbol table gets stored as a function's attribute in order to avoid running
-                # this code everytime the call is made, thus preventing the computational overhead.
+                # this code every time the call is made, thus preventing the computational overhead.
                try:
                    fh.ctx["cache"]["symtab"] = SymTab.from_viv(f.vw.parsedbin)
                except Exception:
@@ -598,7 +598,7 @@ def extract_op_number_features(

    if f.vw.probeMemory(v, 1, envi.memory.MM_READ):
        # this is a valid address
-        # assume its not also a constant.
+        # assume it's not also a constant.
        return

    if insn.mnem == "add" and insn.opers[0].isReg() and insn.opers[0].reg == envi.archs.i386.regs.REG_ESP:
--- a/capa/features/file.py
+++ b/capa/features/file.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/features/freeze/init.py
+++ b/capa/features/freeze/init.py
@@ -1,7 +1,7 @@
 """
 capa freeze file format: `| capa0000 | + zlib(utf-8(json(...)))`

-Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -382,7 +382,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
                    address=Address.from_capa(addr),
                    feature=feature_from_capa(feature),
                )  # type: ignore
-                # Mypy is unable to recognise `basic_block` as a argument due to alias
+                # Mypy is unable to recognise `basic_block` as an argument due to alias
                for feature, addr in extractor.extract_basic_block_features(f, bb)
            ]

@@ -419,7 +419,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
                features=tuple(ffeatures),
                basic_blocks=basic_blocks,
            )  # type: ignore
-            # Mypy is unable to recognise `basic_blocks` as a argument due to alias
+            # Mypy is unable to recognise `basic_blocks` as an argument due to alias
        )

    features = StaticFeatures(
@@ -427,7 +427,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
        file=tuple(file_features),
        functions=tuple(function_features),
    )  # type: ignore
-    # Mypy is unable to recognise `global_` as a argument due to alias
+    # Mypy is unable to recognise `global_` as an argument due to alias

    freeze = Freeze(
        version=CURRENT_VERSION,
@@ -437,7 +437,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
        extractor=Extractor(name=extractor.__class__.__name__),
        features=features,
    )  # type: ignore
-    # Mypy is unable to recognise `base_address` as a argument due to alias
+    # Mypy is unable to recognise `base_address` as an argument due to alias

    return freeze.model_dump_json()

@@ -485,7 +485,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
                    address=Address.from_capa(addr),
                    feature=feature_from_capa(feature),
                )  # type: ignore
-                # Mypy is unable to recognise `basic_block` as a argument due to alias
+                # Mypy is unable to recognise `basic_block` as an argument due to alias
                for feature, addr in extractor.extract_thread_features(p, t)
            ]

@@ -532,7 +532,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
        file=tuple(file_features),
        processes=tuple(process_features),
    )  # type: ignore
-    # Mypy is unable to recognise `global_` as a argument due to alias
+    # Mypy is unable to recognise `global_` as an argument due to alias

    # workaround around mypy issue: https://github.com/python/mypy/issues/1424
    get_base_addr = getattr(extractor, "get_base_addr", None)
@@ -546,7 +546,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
        extractor=Extractor(name=extractor.__class__.__name__),
        features=features,
    )  # type: ignore
-    # Mypy is unable to recognise `base_address` as a argument due to alias
+    # Mypy is unable to recognise `base_address` as an argument due to alias

    return freeze.model_dump_json()

--- a/capa/features/freeze/features.py
+++ b/capa/features/freeze/features.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2022 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -132,7 +132,7 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
    elif isinstance(f, capa.features.file.Import):
        assert isinstance(f.value, str)
        return ImportFeature(import_=f.value, description=f.description)  # type: ignore
-        # Mypy is unable to recognise `import_` as a argument due to alias
+        # Mypy is unable to recognise `import_` as an argument due to alias

    elif isinstance(f, capa.features.file.Section):
        assert isinstance(f.value, str)
@@ -141,7 +141,7 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
    elif isinstance(f, capa.features.file.FunctionName):
        assert isinstance(f.value, str)
        return FunctionNameFeature(function_name=f.value, description=f.description)  # type: ignore
-        # Mypy is unable to recognise `function_name` as a argument due to alias
+        # Mypy is unable to recognise `function_name` as an argument due to alias

    # must come before check for String due to inheritance
    elif isinstance(f, capa.features.common.Substring):
@@ -160,7 +160,7 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
    elif isinstance(f, capa.features.common.Class):
        assert isinstance(f.value, str)
        return ClassFeature(class_=f.value, description=f.description)  # type: ignore
-        # Mypy is unable to recognise `class_` as a argument due to alias
+        # Mypy is unable to recognise `class_` as an argument due to alias

    elif isinstance(f, capa.features.common.Namespace):
        assert isinstance(f.value, str)
@@ -197,12 +197,12 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
    elif isinstance(f, capa.features.insn.OperandNumber):
        assert isinstance(f.value, int)
        return OperandNumberFeature(index=f.index, operand_number=f.value, description=f.description)  # type: ignore
-        # Mypy is unable to recognise `operand_number` as a argument due to alias
+        # Mypy is unable to recognise `operand_number` as an argument due to alias

    elif isinstance(f, capa.features.insn.OperandOffset):
        assert isinstance(f.value, int)
        return OperandOffsetFeature(index=f.index, operand_offset=f.value, description=f.description)  # type: ignore
-        # Mypy is unable to recognise `operand_offset` as a argument due to alias
+        # Mypy is unable to recognise `operand_offset` as an argument due to alias

    else:
        raise NotImplementedError(f"feature_from_capa({type(f)}) not implemented")
--- a/capa/features/insn.py
+++ b/capa/features/insn.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ghidra/README.md
+++ b/capa/ghidra/README.md
@@ -1,184 +1,107 @@
 <div align="center">
-    <img src="/doc/img/ghidra_backend_logo.png" width=300 height=175>
+    <img src="../../doc/img/ghidra_backend_logo.png" width=240 height=125>
 </div>

-The Ghidra feature extractor is an application of the FLARE team's open-source project, Ghidrathon, to integrate capa with Ghidra using Python 3. capa is a framework that uses a well-defined collection of rules to identify capabilities in a program. You can run capa against a PE file, ELF file, or shellcode and it tells you what it thinks the program can do. For example, it might suggest that the program is a backdoor, can install services, or relies on HTTP to communicate. The Ghidra feature extractor can be used to run capa analysis on your Ghidra databases without needing access to the original binary file. As a part of this integration, we've developed two scripts, [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py), to display capa results directly in Ghidra.
+# capa + Ghidra

-### Using `capa_explorer.py`
+[capa](https://github.com/mandiant/capa) is the FLARE team’s open-source tool that detects capabilities in executable files. [Ghidra](https://github.com/NationalSecurityAgency/ghidra) is an open-source software reverse engineering framework created and maintained by the National Security Agency Research Directorate. capa + Ghidra brings capa’s detection capabilities directly to Ghidra’s user interface, helping speed up your reverse engineering tasks by identifying what parts of a program suggest interesting behavior, such as setting a registry value. You can execute the included Python 3 scripts [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) or [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) to run capa’s analysis and view the results in Ghidra. You may be asking yourself, “Python 3 scripts in Ghidra?” You read that correctly. This integration is written entirely in Python 3 and relies on [Ghidrathon](https://github.com/mandiant/ghidrathon), an open-source Ghidra extension that adds Python 3 scripting to Ghidra.

-`capa_explorer.py` integrates capa results directly into Ghidra's UI. In the Symbol Tree Window, under the Namespaces section, you can find the matched rules as well as the corresponding functions that contain the matched features:
+Check out our capa + Ghidra blog posts:
+* [Riding Dragons: capa Harnesses Ghidra](https://www.mandiant.com/resources/blog/capa-harnesses-ghidra)

-![image](https://github.com/mandiant/capa/assets/66766340/eeae33f4-99d4-42dc-a5e8-4c1b8c661492)
+## UI Integration
+[capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) renders capa results in Ghidra's UI to help you quickly navigate them. This includes adding matched functions to Ghidra’s Symbol Tree and Bookmarks windows and adding comments to functions that indicate matched capabilities and features. You can execute this script using Ghidra’s Script Manager window.

-Labeled functions may be clicked in the Symbol Tree Window to navigate Ghidra's Disassembly Listing and Decompilation windows to the function locations. A comment listing each matched capa rule is inserted at the beginning of the function and a comment for each matched capa feature is added at the matched address within the function. These comments can be viewed using Ghidra's Disassembly Listing and Decompilation windows:
+### Symbol Tree Window
+Matched functions are added to Ghidra's Symbol Tree window under a custom namespace that maps to the capabilities' [capa namespace](https://github.com/mandiant/capa-rules/blob/master/doc/format.md#rule-namespace).
+<div align="center">
+    <img src="https://github.com/mandiant/capa/assets/66766340/eeae33f4-99d4-42dc-a5e8-4c1b8c661492" width=300>
+</div>

-![image](https://github.com/mandiant/capa/assets/66766340/bb2b4170-7fd4-45fc-8c7b-ff8f2e2f101b)
+### Comments

-The script also adds bookmarks for capa matches that are categorized under MITRE ATT&CK and Malware Behavior Catalog. These may be found and navigated using Ghidra's Bookmarks Window:
+Comments are added at the beginning of matched functions indicating matched capabilities and inline comments are added to functions indicating matched features. You can view these comments in Ghidra’s Disassembly Listing and Decompile windows.
+<div align="center">
+    <img src="https://github.com/mandiant/capa/assets/66766340/bb2b4170-7fd4-45fc-8c7b-ff8f2e2f101b" width=1000>
+</div>

-![image](https://github.com/mandiant/capa/assets/66766340/7f9a66a9-7be7-4223-91c6-4b8fc4651336)
+### Bookmarks

-### Using `capa_ghidra.py`
+Bookmarks are added to functions that matched a capability that is mapped to a MITRE ATT&CK and/or Malware Behavior Catalog (MBC) technique. You can view these bookmarks in Ghidra's Bookmarks window.
+<div align="center">
+    <img src="https://github.com/mandiant/capa/assets/66766340/7f9a66a9-7be7-4223-91c6-4b8fc4651336" width=825>
+</div>

-`capa_ghidra.py` displays capa results in Ghidra's Console window and can be executed using Ghidra's Headless Analyzer. The following is an example of running `capa_ghidra.py` using the Ghidra Script Manager:
+## Text-based Integration

-Selecting capa rules:
-<img src="/doc/img/ghidra_script_mngr_rules.png">
+[capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) outputs text-based capa results that mirror the output of capa’s standalone tool. You can execute this script using Ghidra’s Script Manager and view its output in Ghidra’s Console window.

-Choosing output format:
-<img src="/doc/img/ghidra_script_mngr_verbosity.png">
+<div align="center">
+  <img src="../../doc/img/ghidra_script_mngr_output.png" width=700>
+</div>

-Viewing results in Ghidra Console Window:
-<img src="/doc/img/ghidra_script_mngr_output.png">
+You can also execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra's Headless Analyzer to view its output in a terminal window.

-## Installation
+<div align="center">
+  <img src="../../doc/img/ghidra_headless_analyzer.png">
+</div>

-### Requirements
+# Getting Started
+
+## Requirements

 | Tool | Version | Source |
 |------------|---------|--------|
+| capa | `>= 7.0.0` | https://github.com/mandiant/capa/releases |
 | Ghidrathon | `>= 3.0.0` | https://github.com/mandiant/Ghidrathon/releases |
 | Ghidra | `>= 10.3.2` | https://github.com/NationalSecurityAgency/ghidra/releases |
 | Python | `>= 3.8.0` | https://www.python.org/downloads |

-You can run capa in Ghidra by completing the following steps using the Python 3 interpreter that you have configured for your Ghidrathon installation:
+## Installation
+
+**Note**: capa + Ghidra relies on [Ghidrathon](https://github.com/mandiant/ghidrathon) to execute Python 3 code in Ghidra. You must first install and configure Ghidrathon using the [steps outlined in its README](https://github.com/mandiant/ghidrathon?tab=readme-ov-file#installing-ghidrathon). Then, you must use the Python 3 interpreter that you configured with Ghidrathon to complete the following steps:

 1. Install capa and its dependencies from PyPI using the following command:
 ```bash
 $ pip install flare-capa
 ```

-2. Download and extract the [official capa rules](https://github.com/mandiant/capa-rules/releases) that match the capa version you have installed. Use the following command to view the version of capa you have installed:
+2. Download and extract the [official capa rules](https://github.com/mandiant/capa-rules/releases) that match the capa version you have installed. You can use the following command to view the version of capa you have installed:
 ```bash
 $ pip show flare-capa
 OR
 $ capa --version
 ```

-3. Copy [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) to your `$USER_HOME/ghidra_scripts` directory or manually add the absolute path of each script to the Ghidra Script Manager.
+3. Copy [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) to your `ghidra_scripts` directory or manually add the parent directory of each script using Ghidra’s Script Manager.

 ## Usage

-After completing the installation steps you can execute [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using the Ghidra Script Manager. You can also execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra's Headless Analyzer.
+You can execute [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra’s Script Manager. [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) can also be executed using Ghidra's Headless Analyzer.

-### Ghidra Script Manager
+### Execution using Ghidra’s Script Manager

-Use the following steps to execute [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra's Script Manager:
-1. Open the Ghidra Script Manager by navigating to `Window > Script Manager`
-2. Locate [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) by selecting the `Python 3 > capa` category or using the Ghidra Script Manager search functionality
-3. Double-click [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) or [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) to execute the script
+You can execute [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra's Script Manager as follows:
+1. Navigate to `Window > Script Manager`
+2. Expand the `Python 3 > capa` category
+3. Double-click a script to execute it

-If you don't see [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) make sure you have copied these scripts to your `$USER_HOME/ghidra_scripts` directory or manually added the absolute path of each script to the Ghidra Script Manager.
+Both scripts ask you to provide the path of your capa rules directory (see installation step 2). [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) also asks you to choose one of the `default`, `verbose`, or `vverbose` output formats, which mirror the output formats of capa’s standalone tool.

-Both scripts ask you to provide the path of your capa rules directory. [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) also asks you to select `default`, `verbose`, and `vverbose` output formats used when writing output to the Ghidra Console Window.
+### Execution using Ghidra’s Headless Analyzer

-### Ghidra Headless Analyzer
+You can execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra’s Headless Analyzer by invoking the `analyzeHeadless` script included with Ghidra in its `support` directory. The following arguments must be provided:

-To execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using the Ghidra Headless Analyzer, you can use the Ghidra `analyzeHeadless` script located in your `<ghidra_install_path>/support` directory. You will need to provide the following arguments to the Ghidra `analyzeHeadless` script:
-
-1. `<ghidra_project_path>`: path to Ghidra project
-2. `<ghidra_project_name>`: name of Ghidra Project
-3. `-process <sample_name>`: name of sample `<sample_name>`
-4. `-ScriptPath <capa_ghidra_path>`: OPTIONAL argument specifying the absolute path of [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py)
-5. `-PostScript capa_ghidra.py`: execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) as post-analysis script
-6. `"<capa_args>"`: single, quoted string containing capa arguments that must specify capa rules directory and output format, e.g. `"<capa_rules_path> --verbose"`. [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) supports `default`, `verbose`, `vverbose` and `json` formats when executed using the Ghidra Headless Analyzer. [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) writes output to the console window used to execute the Ghidra `analyzeHeadless` script.
+| Argument | Description |
+|----|----|
+|`<project_path>`| Path to Ghidra project|
+| `<project_name>`| Name of Ghidra project|
+| `-Process <sample_name>` OR `-Import <sample_path>`| Name of sample `<sample_name>` already imported into `<project_name>` OR absolute path of sample `<sample_path>` to import into `<project_name>`|
+| `-ScriptPath <script_path>`| OPTIONAL parent directory `<script_path>` of [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py)|
+| `-PostScript capa_ghidra.py`| Execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) after Ghidra analysis|
+| `"<script_args>"`| Quoted string `"<script_args>"` containing script arguments passed to [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) that must specify a capa rules path and optionally the output format (`--verbose`, `--vverbose`, `--json`) – you can specify `"help"` to view the script’s help message |

 The following is an example of combining these arguments into a single `analyzeHeadless` script command:
-
-```
-<ghidra_install_path>/support/analyzeHeadless <ghidra_project_path> <ghidra_project_name> -process <sample_name> -PostScript capa_ghidra.py "<capa_rules_path> --verbose"
-```
-
-You may also want to run capa against a sample that you have not yet imported into your Ghidra project. The following is an example of importing a sample and running [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using a single `analyzeHeadless` script command:
-
-```
-<ghidra_install_path>/support/analyzeHeadless <ghidra_project_path> <ghidra_project_name> -Import <sample_path> -PostScript capa_ghidra.py "<capa_rules_path> --verbose"
-```
-
-You can also provide [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) the single argument `"help"` to view supported arguments when running the script using the Ghidra Headless Analyzer:
-```
-<ghidra_install_path>/support/analyzeHeadless <ghidra_project_path> <ghidra_project_name> -process <sample_name> -PostScript capa_ghidra.py "help"
-```
-
-The following is an example of running [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) against a shellcode sample using the Ghidra `analyzeHeadless` script:
-```
-$ analyzeHeadless /home/wumbo/Desktop/ghidra_projects/ capa_test -process 499c2a85f6e8142c3f48d4251c9c7cd6.raw32 -processor x86:LE:32:default -PostScript capa_ghidra.py "/home/wumbo/capa/rules -vv"
-[...]
-
-INFO  REPORT: Analysis succeeded for file: /499c2a85f6e8142c3f48d4251c9c7cd6.raw32 (HeadlessAnalyzer)  
-INFO  SCRIPT: /home/wumbo/ghidra_scripts/capa_ghidra.py (HeadlessAnalyzer)  
-md5                     499c2a85f6e8142c3f48d4251c9c7cd6                                                                                                                                                                                                    
-sha1
-sha256                  e8e02191c1b38c808d27a899ac164b3675eb5cadd3a8907b0ffa863714000e72
-path                    /home/wumbo/capa/tests/data/499c2a85f6e8142c3f48d4251c9c7cd6.raw32
-timestamp               2023-08-29 17:57:00.946588
-capa version            6.1.0
-os                      unknown os
-format                  Raw Binary
-arch                    x86
-extractor               ghidra
-base address            global
-rules                   /home/wumbo/capa/rules
-function count          42
-library function count  0
-total feature count     1970
-
-contain loop (24 matches, only showing first match of library rule)
-author  moritz.raabe@mandiant.com
-scope   function
-function @ 0x0
-  or:
-    characteristic: loop @ 0x0
-    characteristic: tight loop @ 0x278
-
-contain obfuscated stackstrings
-namespace  anti-analysis/obfuscation/string/stackstring
-author     moritz.raabe@mandiant.com
-scope      basic block
-att&ck     Defense Evasion::Obfuscated Files or Information::Indicator Removal from Tools [T1027.005]
-mbc        Anti-Static Analysis::Executable Code Obfuscation::Argument Obfuscation [B0032.020], Anti-Static Analysis::Executable Code Obfuscation::Stack Strings [B0032.017]
-basic block @ 0x0 in function 0x0
-  characteristic: stack string @ 0x0
-
-encode data using XOR
-namespace  data-manipulation/encoding/xor
-author     moritz.raabe@mandiant.com
-scope      basic block
-att&ck     Defense Evasion::Obfuscated Files or Information [T1027]
-mbc        Defense Evasion::Obfuscated Files or Information::Encoding-Standard Algorithm [E1027.m02], Data::Encode Data::XOR [C0026.002]
-basic block @ 0x8AF in function 0x8A1
-  and:
-    characteristic: tight loop @ 0x8AF
-    characteristic: nzxor @ 0x8C0
-    not: = filter for potential false positives
-      or:
-        or: = unsigned bitwise negation operation (~i)
-          number: 0xFFFFFFFF = bitwise negation for unsigned 32 bits
-          number: 0xFFFFFFFFFFFFFFFF = bitwise negation for unsigned 64 bits
-        or: = signed bitwise negation operation (~i)
-          number: 0xFFFFFFF = bitwise negation for signed 32 bits
-          number: 0xFFFFFFFFFFFFFFF = bitwise negation for signed 64 bits
-        or: = Magic constants used in the implementation of strings functions.
-          number: 0x7EFEFEFF = optimized string constant for 32 bits
-          number: 0x81010101 = -0x81010101 = 0x7EFEFEFF
-          number: 0x81010100 = 0x81010100 = ~0x7EFEFEFF
-          number: 0x7EFEFEFEFEFEFEFF = optimized string constant for 64 bits
-          number: 0x8101010101010101 = -0x8101010101010101 = 0x7EFEFEFEFEFEFEFF
-          number: 0x8101010101010100 = 0x8101010101010100 = ~0x7EFEFEFEFEFEFEFF
-
-get OS information via KUSER_SHARED_DATA
-namespace   host-interaction/os/version
-author      @mr-tz
-scope       function
-att&ck      Discovery::System Information Discovery [T1082]
-references  https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/ntexapi_x/kuser_shared_data/index.htm
-function @ 0x1CA6
-  or:
-    number: 0x7FFE026C = NtMajorVersion @ 0x1D18
-
-
-
-Script /home/wumbo/ghidra_scripts/capa_ghidra.py called exit with code 0
-
-[...]
+```bash
+$ analyzeHeadless /home/wumbo/demo demo -Import /home/wumbo/capa/tests/data/Practical\ Malware\ Analysis\ Lab\ 01-01.dll_ -PostScript capa_ghidra.py "/home/wumbo/capa/rules --verbose"
 ```
--- a/capa/ghidra/capa_explorer.py
+++ b/capa/ghidra/capa_explorer.py
@@ -1,8 +1,8 @@
-# Integrate capa results with Ghidra UI
+# Run capa against loaded Ghidra database and render results in Ghidra UI
 # @author Colton Gabertan (gabertan.colton@gmail.com)
 # @category Python 3.capa

-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ghidra/capa_ghidra.py
+++ b/capa/ghidra/capa_ghidra.py
@@ -1,4 +1,4 @@
-# Run capa against loaded Ghidra database
+# Run capa against loaded Ghidra database and render results in Ghidra Console window
 # @author Mike Hunhoff (mehunhoff@google.com)
 # @category Python 3.capa

--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -6,6 +6,7 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 import sys
+import gzip
 import json
 import inspect
 import logging
@@ -30,7 +31,7 @@ from capa.features.common import (

 EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32")
 EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64")
-EXTENSIONS_DYNAMIC = ("json", "json_")
+EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz")
 EXTENSIONS_ELF = "elf_"
 EXTENSIONS_FREEZE = "frz"

@@ -70,9 +71,19 @@ def assert_never(value) -> NoReturn:
    assert False, f"Unhandled value: {value} ({type(value).__name__})"  # noqa: B011


-def get_format_from_report(sample: Path) -> str:
-    report = json.load(sample.open(encoding="utf-8"))
+def load_json_from_path(json_path: Path):
+    with gzip.open(json_path, "r") as compressed_report:
+        try:
+            report_json = compressed_report.read()
+        except gzip.BadGzipFile:
+            report = json.load(json_path.open(encoding="utf-8"))
+        else:
+            report = json.loads(report_json)
+    return report

+
+def get_format_from_report(sample: Path) -> str:
+    report = load_json_from_path(sample)
    if "CAPE" in report:
        return FORMAT_CAPE

--- a/capa/ida/helpers.py
+++ b/capa/ida/helpers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/init.py
+++ b/capa/ida/plugin/init.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -125,7 +125,7 @@ def install_icon():
        return False

    # resource leak here. need to call `ida_kernwin.free_custom_icon`?
-    # however, since we're not cycling this icon a lot, its probably ok.
+    # however, since we're not cycling this icon a lot, it's probably ok.
    # expect to leak exactly one icon per application load.
    icon = ida_kernwin.load_custom_icon(data=ICON)

--- a/capa/ida/plugin/capa_explorer.py
+++ b/capa/ida/plugin/capa_explorer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/form.py
+++ b/capa/ida/plugin/form.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/hooks.py
+++ b/capa/ida/plugin/hooks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/icon.py
+++ b/capa/ida/plugin/icon.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/item.py
+++ b/capa/ida/plugin/item.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/model.py
+++ b/capa/ida/plugin/model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/proxy.py
+++ b/capa/ida/plugin/proxy.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/ida/plugin/view.py
+++ b/capa/ida/plugin/view.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -196,7 +196,7 @@ class CapaExplorerRulegenPreview(QtWidgets.QTextEdit):
            f"      - {author}",
            "    scopes:",
            f"      static: {scope}",
-            "      dynamic: unspecified",
+            "      dynamic: unsupported",
            "    references:",
            "      - <insert_references>",
            "    examples:",
@@ -764,7 +764,7 @@ class CapaExplorerRulegenEditor(QtWidgets.QTreeWidget):

            node = self.make_child_node_from_feature(parent, parse_yaml_line(line.strip()))

-            # append our new node in case its a parent for another node
+            # append our new node in case it's a parent for another node
            if node:
                stack.append(node)

--- a/capa/loader.py
+++ b/capa/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -6,13 +6,13 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 import sys
-import json
 import logging
 import datetime
+import contextlib
 from typing import Set, Dict, List, Optional
 from pathlib import Path

-import halo
+from rich.console import Console
 from typing_extensions import assert_never

 import capa.perf
@@ -31,9 +31,6 @@ import capa.features.extractors
 import capa.render.result_document
 import capa.render.result_document as rdoc
 import capa.features.extractors.common
-import capa.features.extractors.pefile
-import capa.features.extractors.elffile
-import capa.features.extractors.dotnetfile
 import capa.features.extractors.base_extractor
 import capa.features.extractors.cape.extractor
 from capa.rules import RuleSet
@@ -158,6 +155,18 @@ def get_workspace(path: Path, input_format: str, sigpaths: List[Path]):

    viv_utils.flirt.register_flirt_signature_analyzers(vw, [str(s) for s in sigpaths])

+    with contextlib.suppress(Exception):
+        # unfortunately viv raises a raw Exception (not any subclass).
+        # This happens when the module isn't found, such as with a viv upgrade.
+        #
+        # Remove the symbolic switch case solver.
+        # This is only enabled for ELF files, not PE files.
+        # During the following performance investigation, this analysis module
+        # had some terrible worst-case behavior.
+        # We can put up with slightly worse CFG reconstruction in order to avoid this.
+        # https://github.com/mandiant/capa/issues/1989#issuecomment-1948022767
+        vw.delFuncAnalysisModule("vivisect.analysis.generic.symswitchcase")
+
    vw.analyze()

    logger.debug("%s", get_meta_str(vw))
@@ -180,10 +189,14 @@ def get_extractor(
      UnsupportedArchError
      UnsupportedOSError
    """
+
+    # stderr=True is used here to redirect the spinner banner to stderr, so that users can redirect capa's output.
+    console = Console(stderr=True, quiet=disable_progress)
+
    if backend == BACKEND_CAPE:
        import capa.features.extractors.cape.extractor

-        report = json.loads(input_path.read_text(encoding="utf-8"))
+        report = capa.helpers.load_json_from_path(input_path)
        return capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)

    elif backend == BACKEND_DOTNET:
@@ -226,7 +239,7 @@ def get_extractor(
            if os_ == OS_AUTO and not is_supported_os(input_path):
                raise UnsupportedOSError()

-        with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
+        with console.status("analyzing program...", spinner="dots"):
            bv: BinaryView = binaryninja.load(str(input_path))
            if bv is None:
                raise RuntimeError(f"Binary Ninja cannot open file {input_path}")
@@ -251,7 +264,7 @@ def get_extractor(
            if os_ == OS_AUTO and not is_supported_os(input_path):
                raise UnsupportedOSError()

-        with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
+        with console.status("analyzing program...", spinner="dots"):
            vw = get_workspace(input_path, input_format, sigpaths)

            if should_save_workspace:
@@ -276,18 +289,31 @@ def get_extractor(
 def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]:
    file_extractors: List[FeatureExtractor] = []

+    # we use lazy importing here to avoid eagerly loading dependencies
+    # that some specialized environments may not have,
+    # e.g., those that run capa without vivisect.
+
    if input_format == FORMAT_PE:
+        import capa.features.extractors.pefile
+
        file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file))

    elif input_format == FORMAT_DOTNET:
+        import capa.features.extractors.pefile
+        import capa.features.extractors.dotnetfile
+
        file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file))
        file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file))

    elif input_format == FORMAT_ELF:
+        import capa.features.extractors.elffile
+
        file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file))

    elif input_format == FORMAT_CAPE:
-        report = json.loads(input_file.read_text(encoding="utf-8"))
+        import capa.features.extractors.cape.extractor
+
+        report = capa.helpers.load_json_from_path(input_file)
        file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report))

    return file_extractors
--- a/capa/main.py
+++ b/capa/main.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -40,11 +40,6 @@ import capa.features.extractors
 import capa.render.result_document
 import capa.render.result_document as rdoc
 import capa.features.extractors.common
-import capa.features.extractors.pefile
-import capa.features.extractors.elffile
-import capa.features.extractors.dotnetfile
-import capa.features.extractors.base_extractor
-import capa.features.extractors.cape.extractor
 from capa.rules import RuleSet
 from capa.engine import MatchResults
 from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_FREEZE, BACKEND_PEFILE
@@ -251,7 +246,7 @@ def install_common_args(parser, wanted=None):

    if "backend" in wanted:
        backends = [
-            (BACKEND_AUTO, "(default) detect apppropriate backend automatically"),
+            (BACKEND_AUTO, "(default) detect appropriate backend automatically"),
            (BACKEND_VIV, "vivisect"),
            (BACKEND_PEFILE, "pefile (file features only)"),
            (BACKEND_BINJA, "Binary Ninja"),
@@ -322,7 +317,7 @@ def install_common_args(parser, wanted=None):
 # Library code should *not* call these functions.
 #
 # These main routines may raise `ShouldExitError` to indicate the program
-# ...should exit. Its a tiny step away from doing `sys.exit()` directly.
+# ...should exit. It's a tiny step away from doing `sys.exit()` directly.
 # I'm not sure if we should just do that. In the meantime, programs should
 # handle `ShouldExitError` and pass the status code to `sys.exit()`.
 #
@@ -343,8 +338,9 @@ def handle_common_args(args):
      - rules: file system path to rule files.
      - signatures: file system path to signature files.

-    the following field may be added:
+    the following fields may be added:
      - is_default_rules: if the default rules were used.
+      - is_default_signatures: if the default signatures were used.

    args:
      args: The parsed command line arguments from `install_common_args`.
@@ -437,25 +433,11 @@ def handle_common_args(args):

    if hasattr(args, "signatures"):
        if args.signatures == SIGNATURES_PATH_DEFAULT_STRING:
-            logger.debug("-" * 80)
-            logger.debug(" Using default embedded signatures.")
-            logger.debug(
-                " To provide your own signatures, use the form `capa.exe --signature ./path/to/signatures/  /path/to/mal.exe`."
-            )
-            logger.debug("-" * 80)
-
            sigs_path = get_default_root() / "sigs"
-
-            if not sigs_path.exists():
-                logger.error(
-                    "Using default signature path, but it doesn't exist. "  # noqa: G003 [logging statement uses +]
-                    + "Please install the signatures first: "
-                    + "https://github.com/mandiant/capa/blob/master/doc/installation.md#method-2-using-capa-as-a-python-library."
-                )
-                raise IOError(f"signatures path {sigs_path} does not exist or cannot be accessed")
+            args.is_default_signatures = True
        else:
            sigs_path = Path(args.signatures)
-            logger.debug("using signatures path: %s", sigs_path)
+            args.is_default_signatures = False

        args.signatures = sigs_path

@@ -706,6 +688,24 @@ def get_signatures_from_cli(args, input_format: str, backend: str) -> List[Path]
        logger.debug("skipping library code matching: signatures only supports PE files")
        return []

+    if args.is_default_signatures:
+        logger.debug("-" * 80)
+        logger.debug(" Using default embedded signatures.")
+        logger.debug(
+            " To provide your own signatures, use the form `capa.exe --signature ./path/to/signatures/  /path/to/mal.exe`."
+        )
+        logger.debug("-" * 80)
+
+        if not args.signatures.exists():
+            logger.error(
+                "Using default signature path, but it doesn't exist. "  # noqa: G003 [logging statement uses +]
+                + "Please install the signatures first: "
+                + "https://github.com/mandiant/capa/blob/master/doc/installation.md#method-2-using-capa-as-a-python-library."
+            )
+            raise IOError(f"signatures path {args.signatures} does not exist or cannot be accessed")
+    else:
+        logger.debug("using signatures path: %s", args.signatures)
+
    try:
        return capa.loader.get_signatures(args.signatures)
    except IOError as e:
--- a/capa/optimizer.py
+++ b/capa/optimizer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/perf.py
+++ b/capa/perf.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/render/default.py
+++ b/capa/render/default.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -102,7 +102,11 @@ def render_capabilities(doc: rd.ResultDocument, ostream: StringIO):

    if rows:
        ostream.write(
-            tabulate.tabulate(rows, headers=[width("Capability", 50), width("Namespace", 50)], tablefmt="mixed_outline")
+            tabulate.tabulate(
+                rows,
+                headers=[width("Capability", 50), width("Namespace", 50)],
+                tablefmt="mixed_outline",
+            )
        )
        ostream.write("\n")
    else:
@@ -148,7 +152,55 @@ def render_attack(doc: rd.ResultDocument, ostream: StringIO):
    if rows:
        ostream.write(
            tabulate.tabulate(
-                rows, headers=[width("ATT&CK Tactic", 20), width("ATT&CK Technique", 80)], tablefmt="mixed_grid"
+                rows,
+                headers=[width("ATT&CK Tactic", 20), width("ATT&CK Technique", 80)],
+                tablefmt="mixed_grid",
+            )
+        )
+        ostream.write("\n")
+
+
+def render_maec(doc: rd.ResultDocument, ostream: StringIO):
+    """
+    example::
+
+        +--------------------------+-----------------------------------------------------------+
+        | MAEC Category            | MAEC Value                                                |
+        |--------------------------+-----------------------------------------------------------|
+        | analysis-conclusion      | malicious                                                 |
+        |--------------------------+-----------------------------------------------------------|
+        | malware-family           | PlugX                                                     |
+        |--------------------------+-----------------------------------------------------------|
+        | malware-category         | downloader                                                |
+        |                          | launcher                                                  |
+        +--------------------------+-----------------------------------------------------------+
+    """
+    maec_categories = {
+        "analysis_conclusion",
+        "analysis_conclusion_ov",
+        "malware_family",
+        "malware_category",
+        "malware_category_ov",
+    }
+    maec_table = collections.defaultdict(set)
+    for rule in rutils.maec_rules(doc):
+        for maec_category in maec_categories:
+            maec_value = getattr(rule.meta.maec, maec_category, None)
+            if maec_value:
+                maec_table[maec_category].add(maec_value)
+
+    rows = []
+    for category in sorted(maec_categories):
+        values = maec_table.get(category, set())
+        if values:
+            rows.append((rutils.bold(category.replace("_", "-")), "\n".join(sorted(values))))
+
+    if rows:
+        ostream.write(
+            tabulate.tabulate(
+                rows,
+                headers=[width("MAEC Category", 25), width("MAEC Value", 75)],
+                tablefmt="mixed_grid",
            )
        )
        ostream.write("\n")
@@ -191,7 +243,9 @@ def render_mbc(doc: rd.ResultDocument, ostream: StringIO):
    if rows:
        ostream.write(
            tabulate.tabulate(
-                rows, headers=[width("MBC Objective", 25), width("MBC Behavior", 75)], tablefmt="mixed_grid"
+                rows,
+                headers=[width("MBC Objective", 25), width("MBC Behavior", 75)],
+                tablefmt="mixed_grid",
            )
        )
        ostream.write("\n")
@@ -204,6 +258,8 @@ def render_default(doc: rd.ResultDocument):
    ostream.write("\n")
    render_attack(doc, ostream)
    ostream.write("\n")
+    render_maec(doc, ostream)
+    ostream.write("\n")
    render_mbc(doc, ostream)
    ostream.write("\n")
    render_capabilities(doc, ostream)
--- a/capa/render/json.py
+++ b/capa/render/json.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/render/result_document.py
+++ b/capa/render/result_document.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -306,7 +306,7 @@ class Match(FrozenModel):
    args:
      success: did the node match?
      node: the logic node or feature node.
-      children: any children of the logic node. not relevent for features, can be empty.
+      children: any children of the logic node. not relevant for features, can be empty.
      locations: where the feature matched. not relevant for logic nodes (except range), can be empty.
      captures: captured values from the string/regex feature, and the locations of those values.
    """
@@ -418,7 +418,7 @@ class Match(FrozenModel):
                            # doc[locations] contains all matches for the given namespace.
                            # for example, the feature might be `match: anti-analysis/packer`
                            # which matches against "generic unpacker" and "UPX".
-                            # in this case, doc[locations] contains locations for *both* of thse.
+                            # in this case, doc[locations] contains locations for *both* of those.
                            #
                            # rule_matches contains the matches for the specific rule.
                            # this is a subset of doc[locations].
--- a/capa/render/utils.py
+++ b/capa/render/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -7,7 +7,7 @@
 # See the License for the specific language governing permissions and limitations under the License.

 import io
-from typing import Union, Iterator
+from typing import Dict, List, Tuple, Union, Iterator, Optional

 import termcolor

@@ -40,9 +40,14 @@ def format_parts_id(data: Union[rd.AttackSpec, rd.MBCSpec]):
    return f"{'::'.join(data.parts)} [{data.id}]"


+def sort_rules(rules: Dict[str, rd.RuleMatches]) -> List[Tuple[Optional[str], str, rd.RuleMatches]]:
+    """Sort rules by namespace and name."""
+    return sorted((rule.meta.namespace or "", rule.meta.name, rule) for rule in rules.values())
+
+
 def capability_rules(doc: rd.ResultDocument) -> Iterator[rd.RuleMatches]:
    """enumerate the rules in (namespace, name) order that are 'capability' rules (not lib/subscope/disposition/etc)."""
-    for _, _, rule in sorted((rule.meta.namespace or "", rule.meta.name, rule) for rule in doc.rules.values()):
+    for _, _, rule in sort_rules(doc.rules):
        if rule.meta.lib:
            continue
        if rule.meta.is_subscope_rule:
@@ -61,6 +66,21 @@ def capability_rules(doc: rd.ResultDocument) -> Iterator[rd.RuleMatches]:
        yield rule


+def maec_rules(doc: rd.ResultDocument) -> Iterator[rd.RuleMatches]:
+    """enumerate 'maec' rules."""
+    for rule in doc.rules.values():
+        if any(
+            [
+                rule.meta.maec.analysis_conclusion,
+                rule.meta.maec.analysis_conclusion_ov,
+                rule.meta.maec.malware_family,
+                rule.meta.maec.malware_category,
+                rule.meta.maec.malware_category_ov,
+            ]
+        ):
+            yield rule
+
+
 class StringIO(io.StringIO):
    def writeln(self, s):
        self.write(s)
--- a/capa/render/verbose.py
+++ b/capa/render/verbose.py
@@ -14,7 +14,7 @@ example::
                 0x10003415
                 0x10003797

-Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at: [package root]/LICENSE.txt
--- a/capa/render/vverbose.py
+++ b/capa/render/vverbose.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
@@ -48,7 +48,7 @@ def hanging_indent(s: str, indent: int) -> str:
 def render_locations(ostream, layout: rd.Layout, locations: Iterable[frz.Address], indent: int):
    import capa.render.verbose as v

-    # its possible to have an empty locations array here,
+    # it's possible to have an empty locations array here,
    # such as when we're in MODE_FAILURE and showing the logic
    # under a `not` statement (which will have no matched locations).
    locations = sorted(locations)
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -9,6 +9,7 @@
 import io
 import os
 import re
+import copy
 import uuid
 import codecs
 import logging
@@ -26,12 +27,11 @@ except ImportError:
    # https://github.com/python/mypy/issues/1153
    from backports.functools_lru_cache import lru_cache  # type: ignore

-from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional
+from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional, cast
 from dataclasses import asdict, dataclass

 import yaml
 import pydantic
-import ruamel.yaml
 import yaml.parser

 import capa.perf
@@ -153,14 +153,6 @@ class Scopes:
        if scopes_["dynamic"] == "unsupported":
            scopes_["dynamic"] = None

-        # unspecified is used to indicate a rule is yet to be migrated.
-        # TODO(williballenthin): this scope term should be removed once all rules have been migrated.
-        # https://github.com/mandiant/capa/issues/1747
-        if scopes_["static"] == "unspecified":
-            scopes_["static"] = None
-        if scopes_["dynamic"] == "unspecified":
-            scopes_["dynamic"] = None
-
        if (not scopes_["static"]) and (not scopes_["dynamic"]):
            raise InvalidRule("invalid scopes value. At least one scope must be specified")

@@ -850,7 +842,7 @@ class Rule:
        """
        fetch the names of rules this rule relies upon.
        these are only the direct dependencies; a user must
-         compute the transitive dependency graph themself, if they want it.
+        compute the transitive dependency graph themself, if they want it.

        Args:
          namespaces(Dict[str, List[Rule]]): mapping from namespace name to rules in it.
@@ -868,14 +860,14 @@ class Rule:
                # we'll give precedence to namespaces, and then assume if that does work,
                #  that it must be a rule name.
                #
-                # we don't expect any collisions between namespaces and rule names, but its possible.
+                # we don't expect any collisions between namespaces and rule names, but it's possible.
                # most likely would be collision between top level namespace (e.g. `host-interaction`) and rule name.
                # but, namespaces tend to use `-` while rule names use ` `. so, unlikely, but possible.
                if statement.value in namespaces:
                    # matches a namespace, so take precedence and don't even check rule names.
                    deps.update(r.name for r in namespaces[statement.value])
                else:
-                    # not a namespace, assume its a rule name.
+                    # not a namespace, assume it's a rule name.
                    assert isinstance(statement.value, str)
                    deps.add(statement.value)

@@ -1053,8 +1045,12 @@ class Rule:

    @staticmethod
    def _get_ruamel_yaml_parser():
-        # use ruamel to enable nice formatting
+        # we use lazy importing here to avoid eagerly loading dependencies
+        # that some specialized environments may not have,
+        # e.g., those that run capa without ruamel.
+        import ruamel.yaml

+        # use ruamel to enable nice formatting
        # we use the ruamel.yaml parser because it supports roundtripping of documents with comments.
        y = ruamel.yaml.YAML(typ="rt")

@@ -1221,16 +1217,22 @@ def get_rules_and_dependencies(rules: List[Rule], rule_name: str) -> Iterator[Ru
    """
    from the given collection of rules, select a rule and its dependencies (transitively).
    """
-    # we evaluate `rules` multiple times, so if its a generator, realize it into a list.
+    # we evaluate `rules` multiple times, so if it's a generator, realize it into a list.
    rules = list(rules)
    namespaces = index_rules_by_namespace(rules)
    rules_by_name = {rule.name: rule for rule in rules}
    wanted = {rule_name}
+    visited = set()

-    def rec(rule):
+    def rec(rule: Rule):
        wanted.add(rule.name)
+        visited.add(rule.name)
+
        for dep in rule.get_dependencies(namespaces):
+            if dep in visited:
+                raise InvalidRule(f'rule "{dep}" has a circular dependency')
            rec(rules_by_name[dep])
+        visited.remove(rule.name)

    rec(rules_by_name[rule_name])

@@ -1254,7 +1256,7 @@ def ensure_rule_dependencies_are_met(rules: List[Rule]) -> None:
    raises:
      InvalidRule: if a dependency is not met.
    """
-    # we evaluate `rules` multiple times, so if its a generator, realize it into a list.
+    # we evaluate `rules` multiple times, so if it's a generator, realize it into a list.
    rules = list(rules)
    namespaces = index_rules_by_namespace(rules)
    rules_by_name = {rule.name: rule for rule in rules}
@@ -1301,7 +1303,7 @@ def topologically_order_rules(rules: List[Rule]) -> List[Rule]:

    assumes that the rule dependency graph is a DAG.
    """
-    # we evaluate `rules` multiple times, so if its a generator, realize it into a list.
+    # we evaluate `rules` multiple times, so if it's a generator, realize it into a list.
    rules = list(rules)
    namespaces = index_rules_by_namespace(rules)
    rules_by_name = {rule.name: rule for rule in rules}
@@ -1364,32 +1366,53 @@ class RuleSet:

        rules = capa.optimizer.optimize_rules(rules)

-        self.file_rules = self._get_rules_for_scope(rules, Scope.FILE)
-        self.process_rules = self._get_rules_for_scope(rules, Scope.PROCESS)
-        self.thread_rules = self._get_rules_for_scope(rules, Scope.THREAD)
-        self.call_rules = self._get_rules_for_scope(rules, Scope.CALL)
-        self.function_rules = self._get_rules_for_scope(rules, Scope.FUNCTION)
-        self.basic_block_rules = self._get_rules_for_scope(rules, Scope.BASIC_BLOCK)
-        self.instruction_rules = self._get_rules_for_scope(rules, Scope.INSTRUCTION)
+        scopes = (
+            Scope.CALL,
+            Scope.THREAD,
+            Scope.PROCESS,
+            Scope.INSTRUCTION,
+            Scope.BASIC_BLOCK,
+            Scope.FUNCTION,
+            Scope.FILE,
+        )
+
        self.rules = {rule.name: rule for rule in rules}
        self.rules_by_namespace = index_rules_by_namespace(rules)
+        self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes}

-        # unstable
-        (self._easy_file_rules_by_feature, self._hard_file_rules) = self._index_rules_by_feature(self.file_rules)
-        (self._easy_process_rules_by_feature, self._hard_process_rules) = self._index_rules_by_feature(
-            self.process_rules
-        )
-        (self._easy_thread_rules_by_feature, self._hard_thread_rules) = self._index_rules_by_feature(self.thread_rules)
-        (self._easy_call_rules_by_feature, self._hard_call_rules) = self._index_rules_by_feature(self.call_rules)
-        (self._easy_function_rules_by_feature, self._hard_function_rules) = self._index_rules_by_feature(
-            self.function_rules
-        )
-        (self._easy_basic_block_rules_by_feature, self._hard_basic_block_rules) = self._index_rules_by_feature(
-            self.basic_block_rules
-        )
-        (self._easy_instruction_rules_by_feature, self._hard_instruction_rules) = self._index_rules_by_feature(
-            self.instruction_rules
-        )
+        # these structures are unstable and may change before the next major release.
+        scores_by_rule: Dict[str, int] = {}
+        self._feature_indexes_by_scopes = {
+            scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes
+        }
+
+    @property
+    def file_rules(self):
+        return self.rules_by_scope[Scope.FILE]
+
+    @property
+    def process_rules(self):
+        return self.rules_by_scope[Scope.PROCESS]
+
+    @property
+    def thread_rules(self):
+        return self.rules_by_scope[Scope.THREAD]
+
+    @property
+    def call_rules(self):
+        return self.rules_by_scope[Scope.CALL]
+
+    @property
+    def function_rules(self):
+        return self.rules_by_scope[Scope.FUNCTION]
+
+    @property
+    def basic_block_rules(self):
+        return self.rules_by_scope[Scope.BASIC_BLOCK]
+
+    @property
+    def instruction_rules(self):
+        return self.rules_by_scope[Scope.INSTRUCTION]

    def __len__(self):
        return len(self.rules)
@@ -1400,154 +1423,358 @@ class RuleSet:
    def __contains__(self, rulename):
        return rulename in self.rules

+    # this routine is unstable and may change before the next major release.
    @staticmethod
-    def _index_rules_by_feature(rules) -> Tuple[Dict[Feature, Set[str]], List[str]]:
+    def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int:
        """
-        split the given rules into two structures:
-          - "easy rules" are indexed by feature,
-            such that you can quickly find the rules that contain a given feature.
-          - "hard rules" are those that contain substring/regex/bytes features or match statements.
-            these continue to be ordered topologically.
+        Score the given feature by how "uncommon" we think it will be.
+        Features that we expect to be very selective (ie. uniquely identify a rule and be required to match),
+         or "uncommon", should get a high score.
+        Features that are not good for indexing will have a low score, or 0.

-        a rule evaluator can use the "easy rule" index to restrict the
-        candidate rules that might match a given set of features.
+        The range of values doesn't really matter, but here we use 0-10, where
+          - 10 is very uncommon, very selective, good for indexing a rule, and
+          - 0 is very common, not selective, bad for indexing a rule.

-        at this time, a rule evaluator can't do anything special with
-        the "hard rules". it must still do a full top-down match of each
-        rule, in topological order.
+        You shouldn't try to interpret the scores, beyond to compare features to pick one or the other.

-        this does not index global features, because these are not selective, and
-        won't be used as the sole feature used to match.
+        Today, these scores are assigned manually, by the capa devs, who use their intuition and experience.
+        We *could* do a large scale analysis of all features emitted by capa across many samples to
+        make this more data driven. If the current approach doesn't work well, consider that.
        """

-        # we'll do a couple phases:
        #
-        #  1. recursively visit all nodes in all rules,
-        #    a. indexing all features
-        #    b. recording the types of features found per rule
-        #  2. compute the easy and hard rule sets
-        #  3. remove hard rules from the rules-by-feature index
-        #  4. construct the topologically ordered list of hard rules
-        rules_with_easy_features: Set[str] = set()
-        rules_with_hard_features: Set[str] = set()
+        # Today, these scores are manually assigned by intuition/experience/guesswork.
+        # We could do a large-scale feature collection and use the results to assign scores.
+        #
+
+        if isinstance(
+            node,
+            capa.features.common.MatchedRule,
+        ):
+            # The other rule must match before this one, in same scope or smaller.
+            # Because we process the rules small->large scope and topologically,
+            # then we can rely on dependencies being processed first.
+            #
+            # If logic changes and you see issues here, ensure that `scores_by_rule` is correctly provided.
+            rule_name = node.value
+            assert isinstance(rule_name, str)
+
+            if rule_name not in scores_by_rule:
+                # It's possible that we haven't scored the rule that is being requested here.
+                # This means that it won't ever match (because it won't be evaluated before this one).
+                # Still, we need to provide a default value here.
+                # So we give it 9, because it won't match, so it's very selective.
+                #
+                # But how could this dependency not exist?
+                # Consider a rule that supports both static and dynamic analysis, but also has
+                # a `instruction: ` block. This block gets translated into a derived rule that only
+                # matches in static mode. Therefore, when the parent rule is run in dynamic mode, it
+                # won't be able to find the derived rule. This is the case we have to handle here.
+                #
+                # A better solution would be to prune this logic based on static/dynamic mode, but
+                # that takes more work and isn't in scope of this feature.
+                #
+                # See discussion in: https://github.com/mandiant/capa/pull/2080/#discussion_r1624783396
+                return 9
+
+            return scores_by_rule[rule_name]
+
+        elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)):
+            v = node.value
+            assert isinstance(v, int)
+
+            if -0x8000 <= v <= 0x8000:
+                # Small numbers are probably pretty common, like structure offsets, etc.
+                return 3
+
+            if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF:
+                # Numbers close to u32::max_int are also probably pretty common,
+                # like signed numbers close to 0 that are stored as unsigned ints.
+                return 3
+
+            if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF:
+                # Like signed numbers close to 0 that are stored as unsigned long ints.
+                return 3
+
+            # Other numbers are assumed to be uncommon.
+            return 7
+
+        elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)):
+            # Scanning features (non-hashable), which we can't use for quick matching/filtering.
+            return 0
+
+        C = node.__class__
+        return {
+            # The range of values doesn't really matter, but here we use 0-10, where
+            #   - 10 is very uncommon, very selective, good for indexing a rule, and
+            #   - 0 is very common, not selective, bad for indexing a rule.
+            #
+            # You shouldn't try to interpret the scores, beyond to compare features to pick one or the other.
+            # -----------------------------------------------------------------
+            #
+            # Very uncommon features that are probably very selective in capa's domain.
+            # When possible, we want rules to be indexed by these features.
+            #
+            capa.features.common.String: 9,
+            capa.features.insn.API: 8,
+            capa.features.file.Export: 7,
+            # "uncommon numbers": 7 (placeholder for logic above)
+            #
+            # -----------------------------------------------------------------
+            #
+            # Features that are probably somewhat common, and/or rarely used within capa.
+            # It's ok to index rules by these.
+            #
+            capa.features.common.Class: 5,
+            capa.features.common.Namespace: 5,
+            capa.features.insn.Property: 5,
+            capa.features.file.Import: 5,
+            capa.features.file.Section: 5,
+            capa.features.file.FunctionName: 5,
+            #
+            # -----------------------------------------------------------------
+            #
+            # Features that are pretty common and we'd prefer not to index, but can if we have to.
+            #
+            capa.features.common.Characteristic: 4,
+            capa.features.insn.Offset: 4,
+            capa.features.insn.OperandOffset: 4,
+            # "common numbers": 3 (placeholder for logic above)
+            #
+            # -----------------------------------------------------------------
+            #
+            # Very common features, which we'd only prefer instead of non-hashable features, like Regex/Substring/Bytes.
+            #
+            capa.features.insn.Mnemonic: 2,
+            capa.features.basicblock.BasicBlock: 1,
+            #
+            #
+            # We don't *want* to index global features because they're not very selective.
+            # They also don't usually stand on their own - there's always some other logic.
+            #
+            capa.features.common.OS: 0,
+            capa.features.common.Arch: 0,
+            capa.features.common.Format: 0,
+            # -----------------------------------------------------------------
+            #
+            # Non-hashable features, which will require a scan to evaluate, and are therefore quite expensive.
+            #
+            # substring: 0 (placeholder for logic above)
+            # regex: 0 (placeholder for logic above)
+            # bytes: 0 (placeholder for logic above)
+        }[C]
+
+    # this class is unstable and may change before the next major release.
+    @dataclass
+    class _RuleFeatureIndex:
+        # Mapping from hashable feature to the set of names of rules that might have this feature.
+        rules_by_feature: Dict[Feature, Set[str]]
+        # Mapping from rule name to list of Regex/Substring features that have to match.
+        # All these features will be evaluated whenever a String feature is encountered.
+        string_rules: Dict[str, List[Feature]]
+        # Mapping from rule name to list of Bytes features that have to match.
+        # All these features will be evaluated whenever a Bytes feature is encountered.
+        bytes_rules: Dict[str, List[Feature]]
+
+    # this routine is unstable and may change before the next major release.
+    @staticmethod
+    def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> _RuleFeatureIndex:
+        """
+        Index the given rules by their minimal set of most "uncommon" features required to match.
+
+        If absolutely necessary, provide the Regex/Substring/Bytes features
+        (which are not hashable and require a scan) that have to match, too.
+        """
+
        rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set)

-        def rec(rule_name: str, node: Union[Feature, Statement]):
+        def rec(
+            rule_name: str,
+            node: Union[Feature, Statement],
+        ) -> Optional[Tuple[int, Set[Feature]]]:
            """
-            walk through a rule's logic tree, indexing the easy and hard rules,
-            and the features referenced by easy rules.
+            Walk through a rule's logic tree, picking the features to use for indexing,
+            returning the feature and an associated score.
+            The higher the score, the more selective the feature is expected to be.
+            The score is only used internally, to pick the best feature from within AND blocks.
+
+            Note closure over `scores_by_rule`.
            """
-            if isinstance(
-                node,
-                (
-                    # these are the "hard features"
-                    # substring: scanning feature
-                    capa.features.common.Substring,
-                    # regex: scanning feature
-                    capa.features.common.Regex,
-                    # bytes: scanning feature
-                    capa.features.common.Bytes,
-                    # match: dependency on another rule,
-                    # which we have to evaluate first,
-                    # and is therefore tricky.
-                    capa.features.common.MatchedRule,
-                ),
-            ):
-                # hard feature: requires scan or match lookup
-                rules_with_hard_features.add(rule_name)
-            elif isinstance(node, capa.features.common.Feature):
-                if capa.features.common.is_global_feature(node):
-                    # we don't want to index global features
-                    # because they're not very selective.
-                    #
-                    # they're global, so if they match at one location in a file,
-                    # they'll match at every location in a file.
-                    # so thats not helpful to decide how to downselect.
-                    #
-                    # and, a global rule will never be the sole selector in a rule.
-                    pass
-                else:
-                    # easy feature: hash lookup
-                    rules_with_easy_features.add(rule_name)
-                    rules_by_feature[node].add(rule_name)
-            elif isinstance(node, (ceng.Not)):
-                # `not:` statements are tricky to deal with.
+
+            if isinstance(node, (ceng.Not)):
+                # We don't index features within NOT blocks, because we're only looking for
+                # features that should be present.
                #
-                # first, features found under a `not:` should not be indexed,
-                # because they're not wanted to be found.
-                # second, `not:` can be nested under another `not:`, or two, etc.
-                # third, `not:` at the root or directly under an `or:`
-                # means the rule will match against *anything* not specified there,
-                # which is a difficult set of things to compute and index.
-                #
-                # so, if a rule has a `not:` statement, its hard.
-                # as of writing, this is an uncommon statement, with only 6 instances in 740 rules.
-                rules_with_hard_features.add(rule_name)
+                # Technically we could have a rule that does `not: not: foo` and we'd want to
+                # index `foo`. But this is not seen today.
+                return None
+
            elif isinstance(node, (ceng.Some)) and node.count == 0:
-                # `optional:` and `0 or more:` are tricky to deal with.
-                #
-                # when a subtree is optional, it may match, but not matching
+                # When a subtree is optional, it may match, but not matching
                # doesn't have any impact either.
-                # now, our rule authors *should* not put this under `or:`
+                # Now, our rule authors *should* not put this under `or:`
                # and this is checked by the linter,
-                # but this could still happen (e.g. private rule set without linting)
-                # and would be hard to trace down.
-                #
-                # so better to be safe than sorry and consider this a hard case.
-                rules_with_hard_features.add(rule_name)
-            elif isinstance(node, (ceng.Range)) and node.min == 0:
-                # `count(foo): 0 or more` are tricky to deal with.
-                # because the min is 0,
-                # this subtree *can* match just about any feature
-                # (except the given one)
-                # which is a difficult set of things to compute and index.
-                rules_with_hard_features.add(rule_name)
+                return None
+
+            elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max != 0:
+                # `count(foo): 0 or more` is just like an optional block,
+                # because the min is 0, this subtree *can* match just about any feature.
+                return None
+
+            elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max == 0:
+                # `count(foo): 0` is like a not block, which we don't index.
+                return None
+
+            elif isinstance(node, capa.features.common.Feature):
+                return (RuleSet._score_feature(scores_by_rule, node), {node})
+
            elif isinstance(node, (ceng.Range)):
-                rec(rule_name, node.child)
-            elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
+                # feature is found N times
+                return rec(rule_name, node.child)
+
+            elif isinstance(node, ceng.And):
+                # When evaluating an AND block, all of the children need to match.
+                #
+                # So when we index rules, we want to pick the most uncommon feature(s)
+                # for each AND block. If the AND block matches, that feature must be there.
+                # We recursively explore children, computing their
+                # score, and pick the child with the greatest score.
+                #
+                # For example, given the rule:
+                #
+                #     and:
+                #       - mnemonic: mov
+                #       - api: CreateFile
+                #
+                # we prefer to pick `api: CreateFile` because we expect it to be more uncommon.
+                #
+                # Note that the children nodes might be complex, like:
+                #
+                #     and:
+                #       - mnemonic: mov
+                #       - or:
+                #         - api: CreateFile
+                #         - api: DeleteFile
+                #
+                # In this case, we prefer to pick the pair of API features since each is expected
+                # to be more uncommon than the mnemonic.
+                scores: List[Tuple[int, Set[Feature]]] = []
                for child in node.children:
-                    rec(rule_name, child)
-            elif isinstance(node, ceng.Statement):
-                # unhandled type of statement.
-                # this should only happen if a new subtype of `Statement`
-                # has since been added to capa.
+                    score = rec(rule_name, child)
+
+                    if not score:
+                        # maybe an optional block or similar
+                        continue
+
+                    scores.append(score)
+
+                # otherwise we can't index this rule
+                assert len(scores) > 0
+
+                def and_score_key(item):
+                    # order by score, then fewest number of features.
+                    score, features = item
+                    return (score, -len(features))
+
+                scores.sort(key=and_score_key, reverse=True)
+
+                # pick the best feature
+                return scores[0]
+
+            elif isinstance(node, (ceng.Or, ceng.Some)):
+                # When evaluating an OR block, any of the children need to match.
+                # It could be any of them, so we can't decide to only index some of them.
                #
-                # ideally, we'd like to use mypy for exhaustiveness checking
-                # for all the subtypes of `Statement`.
-                # but, as far as i can tell, mypy does not support this type
-                # of checking.
+                # For example, given the rule:
                #
-                # in a way, this makes some intuitive sense:
-                # the set of subtypes of type A is unbounded,
-                # because any user might come along and create a new subtype B,
-                # so mypy can't reason about this set of types.
-                assert_never(node)
+                #     or:
+                #       - mnemonic: mov
+                #       - api: CreateFile
+                #
+                # we have to pick both `mnemonic` and `api` features.
+                #
+                # Note that the children nodes might be complex, like:
+                #
+                #     or:
+                #       - mnemonic: mov
+                #       - and:
+                #         - api: CreateFile
+                #         - api: DeleteFile
+                #
+                # In this case, we have to pick both the `mnemonic` and one of the `api` features.
+                #
+                # When computing the score of an OR branch, we have to use the min value encountered.
+                # While many of the children might be very specific, there might be a branch that is common
+                # and we need to handle that correctly.
+                min_score = 10000000  # assume this is larger than any score
+                features = set()
+
+                for child in node.children:
+                    item = rec(rule_name, child)
+                    assert item is not None, "can't index OR branch"
+
+                    _score, _features = item
+                    min_score = min(min_score, _score)
+                    features.update(_features)
+
+                return min_score, features
+
            else:
                # programming error
                assert_never(node)

+        # These are the Regex/Substring/Bytes features that we have to use for filtering.
+        # Ideally we find a way to get rid of all of these, eventually.
+        string_rules: Dict[str, List[Feature]] = {}
+        bytes_rules: Dict[str, List[Feature]] = {}
+
        for rule in rules:
            rule_name = rule.meta["name"]
+
            root = rule.statement
-            rec(rule_name, root)
+            item = rec(rule_name, root)
+            assert item is not None
+            score, features = item

-        # if a rule has a hard feature,
-        # dont consider it easy, and therefore,
-        # don't index any of its features.
-        #
-        # otherwise, its an easy rule, and index its features
-        for rules_with_feature in rules_by_feature.values():
-            rules_with_feature.difference_update(rules_with_hard_features)
-        easy_rules_by_feature = rules_by_feature
+            string_features = [
+                feature
+                for feature in features
+                if isinstance(feature, (capa.features.common.Substring, capa.features.common.Regex))
+            ]
+            bytes_features = [feature for feature in features if isinstance(feature, capa.features.common.Bytes)]
+            hashable_features = [
+                feature
+                for feature in features
+                if not isinstance(
+                    feature, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)
+                )
+            ]

-        # `rules` is already topologically ordered,
-        # so extract our hard set into the topological ordering.
-        hard_rules = []
-        for rule in rules:
-            if rule.meta["name"] in rules_with_hard_features:
-                hard_rules.append(rule.meta["name"])
+            logger.debug("indexing: features: %d, score: %d, rule: %s", len(features), score, rule_name)
+            scores_by_rule[rule_name] = score
+            for feature in features:
+                logger.debug("        : [%d] %s", RuleSet._score_feature(scores_by_rule, feature), feature)

-        return (easy_rules_by_feature, hard_rules)
+            if string_features:
+                string_rules[rule_name] = cast(List[Feature], string_features)
+
+            if bytes_features:
+                bytes_rules[rule_name] = cast(List[Feature], bytes_features)
+
+            for feature in hashable_features:
+                rules_by_feature[feature].add(rule_name)
+
+        logger.debug("indexing: %d features indexed for scope %s", len(rules_by_feature), scope)
+        logger.debug(
+            "indexing: %d indexed features are shared by more than 3 rules",
+            len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3]),
+        )
+        logger.debug(
+            "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules)
+        )
+
+        return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules)

    @staticmethod
    def _get_rules_for_scope(rules, scope) -> List[Rule]:
@@ -1599,7 +1826,6 @@ class RuleSet:
        apply tag-based rule filter assuming that all required rules are loaded
        can be used to specify selected rules vs. providing a rules child directory where capa cannot resolve
        dependencies from unknown paths
-        TODO handle circular dependencies?
        TODO support -t=metafield <k>
        """
        rules = list(self.rules.values())
@@ -1618,80 +1844,226 @@ class RuleSet:
                            break
        return RuleSet(list(rules_filtered))

-    def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]:
+    # this routine is unstable and may change before the next major release.
+    @staticmethod
+    def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Rule]):
        """
-        match rules from this ruleset at the given scope against the given features.
-
-        this routine should act just like `capa.engine.match`,
-        except that it may be more performant.
+        Sort (in place) the given rules by their index provided by the given Dict.
+        This mapping is intended to represent the topological index of each rule;
+         that is, rules with a lower index should be evaluated first, since the rules
+         that depend on them will be evaluated later.
        """
-        easy_rules_by_feature = {}
-        if scope == Scope.FILE:
-            easy_rules_by_feature = self._easy_file_rules_by_feature
-            hard_rule_names = self._hard_file_rules
-        elif scope == Scope.PROCESS:
-            easy_rules_by_feature = self._easy_process_rules_by_feature
-            hard_rule_names = self._hard_process_rules
-        elif scope == Scope.THREAD:
-            easy_rules_by_feature = self._easy_thread_rules_by_feature
-            hard_rule_names = self._hard_thread_rules
-        elif scope == Scope.CALL:
-            easy_rules_by_feature = self._easy_call_rules_by_feature
-            hard_rule_names = self._hard_call_rules
-        elif scope == Scope.FUNCTION:
-            easy_rules_by_feature = self._easy_function_rules_by_feature
-            hard_rule_names = self._hard_function_rules
-        elif scope == Scope.BASIC_BLOCK:
-            easy_rules_by_feature = self._easy_basic_block_rules_by_feature
-            hard_rule_names = self._hard_basic_block_rules
-        elif scope == Scope.INSTRUCTION:
-            easy_rules_by_feature = self._easy_instruction_rules_by_feature
-            hard_rule_names = self._hard_instruction_rules
-        else:
-            assert_never(scope)
+        rules.sort(key=lambda r: rule_index_by_rule_name[r.name])

-        candidate_rule_names = set()
+    def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]:
+        """
+        Match rules from this ruleset at the given scope against the given features.
+
+        This routine should act just like `capa.engine.match`, except that it may be more performant.
+        It uses its knowledge of all the rules to evaluate a minimal set of candidate rules for the given features.
+        """
+
+        feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope]
+        rules: List[Rule] = self.rules_by_scope[scope]
+        # Topological position of each rule, keyed by its name.
+        # That is, rules with a lower index should be evaluated first, since the rules
+        # that depend on them will be evaluated later.
+        rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)}
+
+        # This algorithm is optimized to evaluate as few rules as possible,
+        # because the less work we do, the faster capa can run.
+        #
+        # It relies on the observation that most rules don't match,
+        # and that most rules have an uncommon feature that *must* be present for the rule to match.
+        #
+        # Therefore, we record which uncommon feature(s) is required for each rule to match,
+        # and then only inspect these few candidates when a feature is seen in some scope.
+        # Ultimately, the exact same rules are matched with precisely the same results,
+        # it's just done faster, because we ignore most of the rules that never would have matched anyway.
+        #
+        # In `_index_rules_by_feature`, we do the hard work of computing the minimal set of
+        # uncommon features for each rule. While it's a little expensive, it's a single pass
+        # that gets reused at every scope instance (read: thousands or millions of times).
+        #
+        # In the current routine, we collect all the rules that might match, given the presence
+        # of any uncommon feature. We sort the rules topologically, so that rule dependencies work out,
+        # and then we evaluate the candidate rules. In practice, this saves 20-50x the work!
+        #
+        # Recall that some features cannot be matched quickly via hash lookup: Regex, Bytes, etc.
+        # When these features are the uncommon features used to filter rules, we have to evaluate the
+        # feature frequently whenever a string/bytes feature is encountered. It's slow, but we can't
+        # get around it. Reducing our reliance on regex/bytes features and/or finding a way to
+        # index these can further improve performance.
+        #
+        # See the corresponding unstable tests in `test_match.py::test_index_features_*`.
+
+        # Find all the rules that could match the given feature set.
+        # Ideally we want this set to be as small and focused as possible,
+        # and we can tune it by tweaking `_index_rules_by_feature`.
+        candidate_rule_names: Set[str] = set()
        for feature in features:
-            easy_rule_names = easy_rules_by_feature.get(feature)
-            if easy_rule_names:
-                candidate_rule_names.update(easy_rule_names)
+            candidate_rule_names.update(feature_index.rules_by_feature.get(feature, ()))

-        # first, match against the set of rules that have at least one
-        # feature shared with our feature set.
+        # Some rules rely totally on regex features, like the HTTP User-Agent rules.
+        # In these cases, when we encounter any string feature, we have to scan those
+        # regexes to find the candidate rules.
+        # As mentioned above, this is not good for performance, but it's required for correctness.
+        #
+        # We may want to try to pre-evaluate these strings, based on their presence in the file,
+        # to reduce the number of evaluations we do here.
+        # See: https://github.com/mandiant/capa/issues/2126
+        #
+        # We may also want to specialize case-insensitive strings, which would enable them to
+        # be indexed, and therefore skip the scanning here, improving performance.
+        # This strategy is described here:
+        # https://github.com/mandiant/capa/issues/2129
+        if feature_index.string_rules:
+
+            # This is a FeatureSet that contains only String features.
+            # Since we'll only be evaluating String/Regex features below, we don't care about
+            # other sorts of features (Mnemonic, Number, etc.) and therefore can save some time
+            # during evaluation.
+            #
+            # Specifically, we can address the issue described here:
+            # https://github.com/mandiant/capa/issues/2063#issuecomment-2095397884
+            # That we spend a lot of time collecting String instances within `Regex.evaluate`.
+            # We don't have to address that issue further as long as we pre-filter the features here.
+            string_features: FeatureSet = {}
+            for feature, locations in features.items():
+                if isinstance(feature, capa.features.common.String):
+                    string_features[feature] = locations
+
+            if string_features:
+                for rule_name, wanted_strings in feature_index.string_rules.items():
+                    for wanted_string in wanted_strings:
+                        if wanted_string.evaluate(string_features):
+                            candidate_rule_names.add(rule_name)
+
+        # Like with String/Regex features above, we have to scan for Bytes to find candidate rules.
+        #
+        # We may want to index bytes when they have a common length, like 16 or 32.
+        # This would help us avoid the scanning here, which would improve performance.
+        # The strategy is described here:
+        # https://github.com/mandiant/capa/issues/2128
+        if feature_index.bytes_rules:
+            bytes_features: FeatureSet = {}
+            for feature, locations in features.items():
+                if isinstance(feature, capa.features.common.Bytes):
+                    bytes_features[feature] = locations
+
+            if bytes_features:
+                for rule_name, wanted_bytess in feature_index.bytes_rules.items():
+                    for wanted_bytes in wanted_bytess:
+                        if wanted_bytes.evaluate(bytes_features):
+                            candidate_rule_names.add(rule_name)
+
+        # No rules can possibly match, so quickly return.
+        if not candidate_rule_names:
+            return (features, {})
+
+        # Here are the candidate rules (before we just had their names).
        candidate_rules = [self.rules[name] for name in candidate_rule_names]
-        features2, easy_matches = ceng.match(candidate_rules, features, addr)

-        # note that we've stored the updated feature set in `features2`.
-        # this contains a superset of the features in `features`;
-        # it contains additional features for any easy rule matches.
-        # we'll pass this feature set to hard rule matching, since one
-        # of those rules might rely on an easy rule match.
+        # Order rules topologically, so that rules with dependencies work correctly.
+        RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
+
        #
-        # the updated feature set from hard matching will go into `features3`.
-        # this is a superset of `features2` is a superset of `features`.
-        # ultimately, this is what we'll return to the caller.
+        # The following is derived from ceng.match
+        # extended to interact with candidate_rules upon rule match.
        #
-        # in each case, we could have assigned the updated feature set back to `features`,
-        # but this is slightly more explicit how we're tracking the data.

-        # now, match against (topologically ordered) list of rules
-        # that we can't really make any guesses about.
-        # these are rules with hard features, like substring/regex/bytes and match statements.
-        hard_rules = [self.rules[name] for name in hard_rule_names]
-        features3, hard_matches = ceng.match(hard_rules, features2, addr)
+        results: ceng.MatchResults = collections.defaultdict(list)

-        # note that above, we probably are skipping matching a bunch of
-        # rules that definitely would never hit.
-        # specifically, "easy rules" that don't share any features with
-        # feature set.
+        # If we match a rule, then we'll add a MatchedRule to the features that will be returned,
+        # but we want to do that in a copy. We'll lazily create the copy below, once a match has
+        # actually been found.
+        augmented_features = features

-        # MatchResults doesn't technically have an .update() method
-        # but a dict does.
-        matches = {}  # type: ignore
-        matches.update(easy_matches)
-        matches.update(hard_matches)
+        while candidate_rules:
+            rule = candidate_rules.pop(0)
+            res = rule.evaluate(augmented_features, short_circuit=True)
+            if res:
+                # we first matched the rule with short circuiting enabled.
+                # this is much faster than without short circuiting.
+                # however, we want to collect all results thoroughly,
+                # so once we've found a match quickly,
+                # go back and capture results without short circuiting.
+                res = rule.evaluate(augmented_features, short_circuit=False)

-        return (features3, matches)
+                # sanity check
+                assert bool(res) is True
+
+                results[rule.name].append((addr, res))
+                # We need to update the current features because subsequent iterations may use newly added features,
+                # such as rule or namespace matches.
+                if augmented_features is features:
+                    # lazily create the copy of features only when a rule matches, since it could be expensive.
+                    augmented_features = collections.defaultdict(set, copy.copy(features))
+
+                ceng.index_rule_matches(augmented_features, rule, [addr])
+
+                # It's possible that we're relying on a MatchedRule (or namespace) feature to be the
+                # uncommon feature used to filter other rules. So, extend the candidate
+                # rules with any of these dependencies. If we find any, also ensure they're
+                # evaluated in the correct topologic order, so that further dependencies work.
+                new_features = [capa.features.common.MatchedRule(rule.name)]
+                for namespace in ceng.get_rule_namespaces(rule):
+                    new_features.append(capa.features.common.MatchedRule(namespace))
+
+                if new_features:
+                    new_candidates: List[str] = []
+                    for new_feature in new_features:
+                        new_candidates.extend(feature_index.rules_by_feature.get(new_feature, ()))
+
+                    if new_candidates:
+                        candidate_rule_names.update(new_candidates)
+                        candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
+                        RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
+
+        return (augmented_features, results)
+
+    def match(
+        self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False
+    ) -> Tuple[FeatureSet, ceng.MatchResults]:
+        """
+        Match rules from this ruleset at the given scope against the given features.
+
+        This wrapper around _match exists so that we can assert it matches precisely
+        the same as `capa.engine.match`, just faster.
+
+        This matcher does not handle some edge cases:
+          - top level NOT statements
+              - also top level counted features with zero occurrences, like: `count(mnemonic(mov)): 0`
+          - nested NOT statements (NOT: NOT: foo)
+
+        We should discourage/forbid these constructs from our rules and add lints for them.
+        TODO(williballenthin): add lints for logic edge cases
+
+        Args:
+          paranoid: when true, demonstrate that the naive matcher agrees with this
+           optimized matcher (much slower! around 10x slower).
+        """
+        features, matches = self._match(scope, features, addr)
+
+        if paranoid:
+            rules: List[Rule] = self.rules_by_scope[scope]
+            paranoid_features, paranoid_matches = capa.engine.match(rules, features, addr)
+
+            if features != paranoid_features:
+                logger.warning("paranoid: %s: %s", scope, addr)
+                for feature in sorted(set(features.keys()) & set(paranoid_features.keys())):
+                    logger.warning("paranoid:   %s", feature)
+
+                for feature in sorted(set(features.keys()) - set(paranoid_features.keys())):
+                    logger.warning("paranoid: + %s", feature)
+
+                for feature in sorted(set(paranoid_features.keys()) - set(features.keys())):
+                    logger.warning("paranoid: - %s", feature)
+
+            assert features == paranoid_features
+            assert set(matches.keys()) == set(paranoid_matches.keys())
+
+        return features, matches


 def is_nursery_rule_path(path: Path) -> bool:
--- a/capa/rules/cache.py
+++ b/capa/rules/cache.py
@@ -159,3 +159,25 @@ def load_cached_ruleset(cache_dir: Path, rule_contents: List[bytes]) -> Optional
        return None
    else:
        return cache.ruleset
+
+
+def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool:
+    if not rules_dir.is_dir():
+        logger.error("rules directory %s does not exist", rules_dir)
+        return False
+
+    try:
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        rules = capa.rules.get_rules([rules_dir], cache_dir)
+    except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e:
+        logger.error("%s", str(e))
+        return False
+
+    content = capa.rules.cache.get_ruleset_content(rules)
+    id = capa.rules.cache.compute_cache_identifier(content)
+    path = capa.rules.cache.get_cache_path(cache_dir, id)
+
+    assert path.exists()
+    logger.info("rules cache saved to: %s", path)
+
+    return True
--- a/capa/version.py
+++ b/capa/version.py
@@ -1,11 +1,11 @@
-# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at: [package root]/LICENSE.txt
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-__version__ = "7.0.1"
+__version__ = "7.1.0"


 def get_major_version():
--- a/doc/faq.md
+++ b/doc/faq.md
@@ -0,0 +1,13 @@
+# Frequently Asked Questions
+## Why does capa trigger my Antivirus? Is the tool safe to use?
+The purpose of `capa` is to analyse the capabilities of a potentially malicious application or file. To achieve this, it needs to include portions of the data it is designed to detect as a basis for comparison.
+The release version of capa comes with embedded rules designed to detect common malware functionality. These rules possess similar features to malware and may trigger alerts.
+Additionally, Antivirus and Endpoint Detection and Response (EDR) products may alert on the way capa is packaged using PyInstaller.
+
+## How can I ensure that capa is a benign program?
+We recommend downloading releases only from this repository's Release page. Alternatively, you can build capa yourself or use other Python installation methods. This project is open-source, ensuring transparency for everyone involved.
+For additional peace of mind, you can utilize VirusTotal to analyze unknown files against numerous antivirus products, sandboxes, and other analysis tools. It's worth noting that capa itself operates within VirusTotal.
+
+### Understanding VirusTotal output
+VirusTotal tests files against a large number of Antivirus engines and sandboxes. There's often little insight into Antivirus detections, but you can further inspect dynamic analysis results produced by sandboxes.
+These details can be used to double-check alerts and understand detections.
--- a/doc/img/ghidra_headless_analyzer.png
+++ b/doc/img/ghidra_headless_analyzer.png
--- a/doc/installation.md
+++ b/doc/installation.md
@@ -91,6 +91,12 @@ For more details about creating and using virtual environments, check out the [v

 ##### Install development dependencies

+When developing capa, please use the pinned dependencies found in `requirements.txt`.
+This ensures that everyone has the exact same, reproducible environment.
+Please install these dependencies before installing capa (from source or from PyPI):
+
+`$ pip install -r requirements.txt`
+
 We use the following tools to ensure consistent code style and formatting:
  - [black](https://github.com/psf/black) code formatter
  - [isort](https://pypi.org/project/isort/) code formatter
@@ -101,7 +107,8 @@ We use the following tools to ensure consistent code style and formatting:

 To install these development dependencies, run:

-`$ pip install -e /local/path/to/src[dev]`
+- `$ pip install -e /local/path/to/src[dev]` or
+- `$ pip install -e /local/path/to/src[dev,scripts]` to also install all script dependencies

 We use [pre-commit](https://pre-commit.com/) so that its trivial to run the same linters & configuration locally as in CI.

--- a/doc/release.md
+++ b/doc/release.md
@@ -1,7 +1,7 @@
 # Release checklist

 - [ ] Ensure all [milestoned issues/PRs](https://github.com/mandiant/capa/milestones) are addressed, or reassign to a new milestone.
- [ ] Add the `dont merge` label to all PRs that are close to be ready to merge (or merge them if they are ready) in [capa](https://github.com/mandiant/capa/pulls) and [capa-rules](https://github.com/mandiant/capa-rules/pulls).
+- [ ] Add the `don't merge` label to all PRs that are close to being ready to merge (or merge them if they are ready) in [capa](https://github.com/mandiant/capa/pulls) and [capa-rules](https://github.com/mandiant/capa-rules/pulls).
 - [ ] Ensure the [CI workflow succeeds in master](https://github.com/mandiant/capa/actions/workflows/tests.yml?query=branch%3Amaster).
 - [ ] Ensure that `python scripts/lint.py rules/ --thorough` succeeds (only `missing examples` offenses are allowed in the nursery). You can [manually trigger a thorough lint](https://github.com/mandiant/capa-rules/actions/workflows/tests.yml) in CI via the "Run workflow" option. 
 - [ ] Review changes
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,24 +32,76 @@ classifiers = [
    "Topic :: Security",
 ]
 dependencies = [
-    "tqdm==4.66.1",
-    "pyyaml==6.0.1",
-    "tabulate==0.9.0",
-    "colorama==0.4.6",
-    "termcolor==2.4.0",
-    "wcwidth==0.2.13",
-    "ida-settings==2.1.0",
-    "viv-utils[flirt]==0.7.9",
-    "halo==0.0.31",
-    "networkx==3.1",
-    "ruamel.yaml==0.18.5",
-    "vivisect==1.1.1",
-    "pefile==2023.2.7",
-    "pyelftools==0.30",
-    "dnfile==0.14.1",
-    "dncil==1.0.2",
-    "pydantic==2.4.0",
-    "protobuf==4.23.4",
+    # ---------------------------------------
+    # As a library, capa uses lower version bounds
+    # when specifying its dependencies. This lets
+    # other programs that use capa (and other libraries)
+    # find a compatible set of dependency versions.
+    #
+    # We can optionally pin to specific versions or
+    # limit the upper bound when there's a good reason;
+    # but the default is to assume all greater versions
+    # probably work with capa until proven otherwise.
+    #
+    # The following link provides good background:
+    # https://iscinumpy.dev/post/bound-version-constraints/
+    #
+    # When we develop capa, and when we distribute it as
+    # a standalone binary, we'll use specific versions
+    # that are pinned in requirements.txt.
+    # But the requirements for a library are specified here
+    # and are looser.
+    #
+    # Related discussions:
+    # 
+    #   - https://github.com/mandiant/capa/issues/2053
+    #   - https://github.com/mandiant/capa/pull/2059
+    #   - https://github.com/mandiant/capa/pull/2079
+    #
+    # ---------------------------------------
+    # The following dependency versions were imported
+    # during June 2024 by truncating specific versions to
+    # their major-most version (major version when possible, 
+    # or minor otherwise).
+    # As specific constraints are identified, please provide
+    # comments and context.
+    "tqdm>=4",
+    "pyyaml>=6",
+    "tabulate>=0.9",
+    "colorama>=0.4",
+    "termcolor>=2",
+    "wcwidth>=0.2",
+    "ida-settings>=2",
+    "ruamel.yaml>=0.18",
+    "pefile>=2023.2.7",
+    "pyelftools>=0.31",
+    "pydantic>=2",
+    "rich>=13",
+    "humanize>=4",
+    "protobuf>=5",
+
+    # ---------------------------------------
+    # Dependencies that we develop
+    #
+    # These dependencies are often actively influenced by capa,
+    # so we provide a minimum patch version that includes the
+    # latest bug fixes we need here.
+    "viv-utils[flirt]>=0.7.9",
+    "vivisect>=1.1.1",
+    "dncil>=1.0.2",
+
+    # ---------------------------------------
+    # Dependencies with version caps
+    #
+    # These dependencies must not exceed the version cap,
+    # typically due to dropping support for python releases
+    # we still support.
+
+    # TODO(williballenthin): networkx 3.2 doesn't support python 3.8 while capa does.
+    # https://github.com/mandiant/capa/issues/1966
+    "networkx>=3,<3.2",
+
+    "dnfile>=0.15.0",
 ]
 dynamic = ["version"]

@@ -62,30 +114,32 @@ namespaces = false

 [project.optional-dependencies]
 dev = [
+    # Dev and build dependencies are not relaxed because
+    # we want all developer environments to be consistent.
+    # These dependencies are not used in production environments
+    # and should not conflict with other libraries/tooling.
    "pre-commit==3.5.0",
    "pytest==8.0.0",
-    "pytest-sugar==0.9.7",
+    "pytest-sugar==1.0.0",
    "pytest-instafail==0.5.0",
-    "pytest-cov==4.1.0",
+    "pytest-cov==5.0.0",
    "flake8==7.0.0",
-    "flake8-bugbear==24.1.17",
+    "flake8-bugbear==24.4.26",
    "flake8-encodings==0.5.1",
    "flake8-comprehensions==3.14.0",
    "flake8-logging-format==0.9.0",
    "flake8-no-implicit-concat==0.3.5",
    "flake8-print==5.0.0",
-    "flake8-todos==0.3.0",
+    "flake8-todos==0.3.1",
    "flake8-simplify==0.21.0",
    "flake8-use-pathlib==0.3.0",
    "flake8-copyright==0.2.4",
-    "ruff==0.1.14",
-    "black==24.1.1",
+    "ruff==0.4.8",
+    "black==24.4.2",
    "isort==5.13.2",
-    "mypy==1.8.0",
-    "psutil==5.9.2",
-    "stix2==3.0.1",
-    "requests==2.31.0",
-    "mypy-protobuf==3.5.0",
+    "mypy==1.10.0",
+    "mypy-protobuf==3.6.0",
+    "PyGithub==2.3.0",
    # type stubs for mypy
    "types-backports==0.1.3",
    "types-colorama==0.4.15.11",
@@ -93,14 +147,101 @@ dev = [
    "types-tabulate==0.9.0.20240106",
    "types-termcolor==1.1.4",
    "types-psutil==5.8.23",
-    "types_requests==2.31.0.20240125",
-    "types-protobuf==4.23.0.3",
+    "types_requests==2.32.0.20240602",
+    "types-protobuf==5.26.0.20240422",
+    "deptry==0.16.1"
 ]
 build = [
-    "pyinstaller==6.3.0",
-    "setuptools==69.0.3",
-    "build==1.0.3"
+    # Dev and build dependencies are not relaxed because
+    # we want all developer environments to be consistent.
+    # These dependencies are not used in production environments
+    # and should not conflict with other libraries/tooling.
+    "pyinstaller==6.8.0",
+    "setuptools==70.0.0",
+    "build==1.2.1"
 ]
+scripts = [
+    "jschema_to_python==1.2.3",
+    "psutil==5.9.2",
+    "stix2==3.0.1",
+    "sarif_om==1.0.4",
+    "requests==2.31.0",
+]
+
+[tool.deptry]
+extend_exclude = [
+    "sigs",
+    "tests"
+]
+
+# dependencies marked as first party, to inform deptry that they are local
+known_first_party = [
+    "backports",
+    "binaryninja",
+    "flirt",
+    "ghidra",
+    "ida_bytes",
+    "ida_entry",
+    "ida_funcs",
+    "ida_kernwin",
+    "ida_loader",
+    "ida_nalt",
+    "ida_segment",
+    "idaapi",
+    "idautils",
+    "idc",
+    "java",
+    "netnode",
+    "PyQt5"
+]
+
+[tool.deptry.per_rule_ignores]
+# dependencies defined but not used in the codebase
+DEP002 = [
+    "black",
+    "build",
+    "deptry",
+    "flake8",
+    "flake8-bugbear",
+    "flake8-comprehensions",
+    "flake8-copyright",
+    "flake8-encodings",
+    "flake8-logging-format",
+    "flake8-no-implicit-concat",
+    "flake8-print",
+    "flake8-simplify",
+    "flake8-todos",
+    "flake8-use-pathlib",
+    "isort",
+    "mypy",
+    "mypy-protobuf",
+    "pre-commit",
+    "PyGithub",
+    "pyinstaller",
+    "pytest",
+    "pytest-cov",
+    "pytest-instafail",
+    "pytest-sugar",
+    "ruff",
+    "setuptools",
+    "types-backports",
+    "types-colorama",
+    "types-protobuf",
+    "types-psutil",
+    "types-PyYAML",
+    "types-tabulate",
+    "types-termcolor",
+    "types_requests",
+    "wcwidth"
+]
+
+# dependencies imported but missing from definitions
+DEP003 = [
+    "typing_extensions" # TODO(s-ff): remove when Python 3.9 is deprecated, see #1699
+]
+
+[tool.deptry.package_module_name_map]
+PyGithub = "github"

 [project.urls]
 Homepage = "https://github.com/mandiant/capa"
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,46 @@
+# Dependencies with specific version constraints
+# used during development and building the standalone executables.
+# For these environments, use `pip install -r requirements.txt`
+# before installing capa from source/pypi. This will ensure
+# the following specific versions are used.
+#
+# Initially generated via: pip freeze | grep -v -- "-e"
+# Kept up to date by dependabot.
+annotated-types==0.7.0
+colorama==0.4.6
+cxxfilt==0.2.2
+dncil==1.0.2
+dnfile==0.15.0
+funcy==2.0
+humanize==4.9.0
+ida-netnode==3.0
+ida-settings==2.1.0
+intervaltree==3.1.0
+markdown-it-py==3.0.0
+mdurl==0.1.2
+msgpack==1.0.8
+networkx==3.1
+pefile==2023.2.7
+pip==24.0
+protobuf==5.27.1
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycparser==2.22
+pydantic==2.7.3
+pydantic-core==2.18.4
+pyelftools==0.31
+pygments==2.18.0
+python-flirt==0.8.6
+pyyaml==6.0.1
+rich==13.7.1
+ruamel-yaml==0.18.6
+ruamel-yaml-clib==0.2.8
+setuptools==70.0.0
+six==1.16.0
+sortedcontainers==2.4.0
+tabulate==0.9.0
+termcolor==2.4.0
+tqdm==4.66.4
+viv-utils==0.7.9
+vivisect==1.1.1
+wcwidth==0.2.13
--- a/Show More
+++ b/Show More