mirror of
https://github.com/mandiant/capa.git
synced 2026-03-16 14:59:04 -07:00
Compare commits
3 Commits
mapa
...
backend-la
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
826377530d | ||
|
|
9c90f0e554 | ||
|
|
7431c67bbe |
@@ -1,27 +0,0 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "9.3.1"
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
filename = "capa/version.py"
|
||||
search = '__version__ = "{current_version}"'
|
||||
replace = '__version__ = "{new_version}"'
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
filename = "capa/ida/plugin/ida-plugin.json"
|
||||
search = '"version": "{current_version}"'
|
||||
replace = '"version": "{new_version}"'
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
filename = "capa/ida/plugin/ida-plugin.json"
|
||||
search = '"flare-capa=={current_version}"'
|
||||
replace = '"flare-capa=={new_version}"'
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
filename = "CHANGELOG.md"
|
||||
search = "v{current_version}...master"
|
||||
replace = "v{current_version}...{new_version}"
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
filename = "CHANGELOG.md"
|
||||
search = "master (unreleased)"
|
||||
replace = "v{new_version}"
|
||||
7
.github/dependabot.yml
vendored
7
.github/dependabot.yml
vendored
@@ -4,13 +4,6 @@ updates:
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
groups:
|
||||
vivisect:
|
||||
patterns:
|
||||
- "vivisect"
|
||||
- "pyasn1"
|
||||
- "pyasn1-modules"
|
||||
- "msgpack"
|
||||
ignore:
|
||||
- dependency-name: "*"
|
||||
update-types: ["version-update:semver-patch"]
|
||||
|
||||
2
.github/flake8.ini
vendored
2
.github/flake8.ini
vendored
@@ -33,6 +33,8 @@ per-file-ignores =
|
||||
scripts/*: T201
|
||||
# capa.exe is meant to print output
|
||||
capa/main.py: T201
|
||||
# IDA tests emit results to output window so need to print
|
||||
tests/test_ida_features.py: T201
|
||||
# utility used to find the Binary Ninja API via invoking python.exe
|
||||
capa/features/extractors/binja/find_binja_api.py: T201
|
||||
|
||||
|
||||
3
.github/mypy/mypy.ini
vendored
3
.github/mypy/mypy.ini
vendored
@@ -63,9 +63,6 @@ ignore_missing_imports = True
|
||||
[mypy-PyQt5.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-binaryninja]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-binaryninja.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
|
||||
2
.github/pull_request_template.md
vendored
2
.github/pull_request_template.md
vendored
@@ -20,5 +20,3 @@ closes #issue_number
|
||||
- [ ] No new tests needed
|
||||
<!-- Please help us keeping capa documentation up-to-date -->
|
||||
- [ ] No documentation update needed
|
||||
<!-- Please indicate if and how you have used AI to generate (parts of) your code submission. Include your prompt, model, tool, etc. -->
|
||||
- [ ] This submission includes AI-generated code and I have provided details in the description.
|
||||
|
||||
7
.github/pyinstaller/pyinstaller.spec
vendored
7
.github/pyinstaller/pyinstaller.spec
vendored
@@ -17,8 +17,6 @@ import sys
|
||||
|
||||
import capa.rules.cache
|
||||
|
||||
from PyInstaller.utils.hooks import collect_submodules
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# SPECPATH is a global variable which points to .spec file path
|
||||
@@ -36,7 +34,6 @@ a = Analysis(
|
||||
["../../capa/main.py"],
|
||||
pathex=["capa"],
|
||||
binaries=None,
|
||||
hiddenimports=collect_submodules('rich'),
|
||||
datas=[
|
||||
# when invoking pyinstaller from the project root,
|
||||
# this gets invoked from the directory of the spec file,
|
||||
@@ -77,10 +74,6 @@ a = Analysis(
|
||||
# only be installed locally.
|
||||
"binaryninja",
|
||||
"ida",
|
||||
"ghidra",
|
||||
# remove once https://github.com/mandiant/capa/issues/2681 has
|
||||
# been addressed by PyInstaller
|
||||
"pkg_resources",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
62
.github/workflows/black-format.yml
vendored
62
.github/workflows/black-format.yml
vendored
@@ -1,62 +0,0 @@
|
||||
name: black auto-format
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [ master ]
|
||||
paths-ignore:
|
||||
- 'web/**'
|
||||
- 'doc/**'
|
||||
- '**.md'
|
||||
workflow_dispatch: # allow manual trigger
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
jobs:
|
||||
black-format:
|
||||
# only run on dependabot PRs or manual trigger
|
||||
if: github.actor == 'dependabot[bot]' || github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
|
||||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
# need a token with write access to push the commit
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Python 3.13
|
||||
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
|
||||
with:
|
||||
python-version: "3.13"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -r requirements.txt
|
||||
pip install -e .[dev,scripts]
|
||||
|
||||
- name: Run isort
|
||||
run: pre-commit run isort --all-files
|
||||
|
||||
- name: Run black/continue
|
||||
# black returns non-zero error code after formatting, which is what we expect
|
||||
continue-on-error: true
|
||||
run: pre-commit run black --all-files
|
||||
|
||||
- name: Check for changes
|
||||
id: changes
|
||||
run: |
|
||||
if git diff --quiet; then
|
||||
echo "has_changes=false" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "has_changes=true" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Commit and push formatting changes
|
||||
if: steps.changes.outputs.has_changes == 'true'
|
||||
run: |
|
||||
git config user.name "${GITHUB_ACTOR}"
|
||||
git config user.email "${GITHUB_ACTOR_ID}+${GITHUB_ACTOR}@users.noreply.github.com"
|
||||
git add -A
|
||||
git commit -m "style: auto-format with black and isort"
|
||||
git push
|
||||
108
.github/workflows/build.yml
vendored
108
.github/workflows/build.yml
vendored
@@ -9,7 +9,6 @@ on:
|
||||
- '**.md'
|
||||
release:
|
||||
types: [edited, published]
|
||||
workflow_dispatch: # manual trigger for testing
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
@@ -23,43 +22,24 @@ jobs:
|
||||
fail-fast: true
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-20.04
|
||||
# use old linux so that the shared library versioning is more portable
|
||||
artifact_name: capa
|
||||
asset_name: linux
|
||||
python_version: '3.10'
|
||||
# for Ghidra
|
||||
java-version: '21'
|
||||
ghidra-version: '12.0'
|
||||
public-version: 'PUBLIC_20251205'
|
||||
ghidra-sha256: 'af43e8cfb2fa4490cf6020c3a2bde25c159d83f45236a0542688a024e8fc1941'
|
||||
- os: ubuntu-22.04-arm
|
||||
artifact_name: capa
|
||||
asset_name: linux-arm64
|
||||
python_version: '3.10'
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-20.04
|
||||
artifact_name: capa
|
||||
asset_name: linux-py312
|
||||
python_version: '3.12'
|
||||
- os: windows-2022
|
||||
- os: windows-2019
|
||||
artifact_name: capa.exe
|
||||
asset_name: windows
|
||||
python_version: '3.10'
|
||||
# Windows 11 ARM64 complains of conflicting package version
|
||||
# Additionally, there is no ARM64 build of Python for Python 3.10 on Windows 11 ARM: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json
|
||||
#- os: windows-11-arm
|
||||
# artifact_name: capa.exe
|
||||
# asset_name: windows-arm64
|
||||
# python_version: '3.12'
|
||||
- os: macos-15-intel
|
||||
# macos-15-intel is the lowest native intel build
|
||||
- os: macos-13
|
||||
# use older macOS for assumed better portability
|
||||
artifact_name: capa
|
||||
asset_name: macos
|
||||
python_version: '3.10'
|
||||
- os: macos-14
|
||||
artifact_name: capa
|
||||
asset_name: macos-arm64
|
||||
python_version: '3.10'
|
||||
steps:
|
||||
- name: Checkout capa
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
|
||||
@@ -69,7 +49,7 @@ jobs:
|
||||
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
- if: matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm'
|
||||
- if: matrix.os == 'ubuntu-20.04'
|
||||
run: sudo apt-get install -y libyaml-dev
|
||||
- name: Upgrade pip, setuptools
|
||||
run: python -m pip install --upgrade pip setuptools
|
||||
@@ -79,28 +59,6 @@ jobs:
|
||||
pip install -e .[build]
|
||||
- name: Build standalone executable
|
||||
run: pyinstaller --log-level DEBUG .github/pyinstaller/pyinstaller.spec
|
||||
- name: Does it run without warnings or errors?
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ matrix.os }}" == "windows-2022" ]] || [[ "${{ matrix.os }}" == "windows-11-arm" ]]; then
|
||||
EXECUTABLE=".\\dist\\capa"
|
||||
else
|
||||
EXECUTABLE="./dist/capa"
|
||||
fi
|
||||
|
||||
output=$(${EXECUTABLE} --version 2>&1)
|
||||
exit_code=$?
|
||||
|
||||
echo "${output}"
|
||||
echo "${exit_code}"
|
||||
|
||||
if echo "${output}" | grep -iE 'error|warning'; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "${exit_code}" -ne 0 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
- name: Does it run (PE)?
|
||||
run: dist/capa -d "tests/data/Practical Malware Analysis Lab 01-01.dll_"
|
||||
- name: Does it run (Shellcode)?
|
||||
@@ -111,29 +69,39 @@ jobs:
|
||||
run: |
|
||||
7z e "tests/data/dynamic/cape/v2.2/d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7.json.gz"
|
||||
dist/capa -d "d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7.json"
|
||||
- name: Set up Java ${{ matrix.java-version }}
|
||||
if: matrix.os == 'ubuntu-22.04' && matrix.python_version == '3.10'
|
||||
uses: actions/setup-java@387ac29b308b003ca37ba93a6cab5eb57c8f5f93 # v4.0.0
|
||||
with:
|
||||
distribution: 'temurin'
|
||||
java-version: ${{ matrix.java-version }}
|
||||
- name: Install Ghidra ${{ matrix.ghidra-version }}
|
||||
if: matrix.os == 'ubuntu-22.04' && matrix.python_version == '3.10'
|
||||
run: |
|
||||
mkdir ./.github/ghidra
|
||||
wget "https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_${{ matrix.ghidra-version }}_build/ghidra_${{ matrix.ghidra-version }}_${{ matrix.public-version }}.zip" -O ./.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip
|
||||
echo "${{ matrix.ghidra-sha256 }} ./.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip" | sha256sum -c -
|
||||
unzip .github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip -d .github/ghidra/
|
||||
- name: Does it run (Ghidra)?
|
||||
if: matrix.os == 'ubuntu-22.04' && matrix.python_version == '3.10'
|
||||
env:
|
||||
GHIDRA_INSTALL_DIR: ${{ github.workspace }}/.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC
|
||||
run: dist/capa -b ghidra -d "tests/data/Practical Malware Analysis Lab 01-01.dll_"
|
||||
- uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
|
||||
with:
|
||||
name: ${{ matrix.asset_name }}
|
||||
path: dist/${{ matrix.artifact_name }}
|
||||
|
||||
test_run:
|
||||
name: Test run on ${{ matrix.os }} / ${{ matrix.asset_name }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
needs: [build]
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# OSs not already tested above
|
||||
- os: ubuntu-22.04
|
||||
artifact_name: capa
|
||||
asset_name: linux
|
||||
- os: ubuntu-22.04
|
||||
artifact_name: capa
|
||||
asset_name: linux-py312
|
||||
- os: windows-2022
|
||||
artifact_name: capa.exe
|
||||
asset_name: windows
|
||||
steps:
|
||||
- name: Download ${{ matrix.asset_name }}
|
||||
uses: actions/download-artifact@eaceaf801fd36c7dee90939fad912460b18a1ffe # v4.1.2
|
||||
with:
|
||||
name: ${{ matrix.asset_name }}
|
||||
- name: Set executable flag
|
||||
if: matrix.os != 'windows-2022'
|
||||
run: chmod +x ${{ matrix.artifact_name }}
|
||||
- name: Run capa
|
||||
run: ./${{ matrix.artifact_name }} -h
|
||||
|
||||
zip_and_upload:
|
||||
# upload zipped binaries to Release page
|
||||
if: github.event_name == 'release'
|
||||
@@ -145,18 +113,12 @@ jobs:
|
||||
include:
|
||||
- asset_name: linux
|
||||
artifact_name: capa
|
||||
- asset_name: linux-arm64
|
||||
artifact_name: capa
|
||||
- asset_name: linux-py312
|
||||
artifact_name: capa
|
||||
- asset_name: windows
|
||||
artifact_name: capa.exe
|
||||
#- asset_name: windows-arm64
|
||||
# artifact_name: capa.exe
|
||||
- asset_name: macos
|
||||
artifact_name: capa
|
||||
- asset_name: macos-arm64
|
||||
artifact_name: capa
|
||||
steps:
|
||||
- name: Download ${{ matrix.asset_name }}
|
||||
uses: actions/download-artifact@eaceaf801fd36c7dee90939fad912460b18a1ffe # v4.1.2
|
||||
@@ -167,7 +129,7 @@ jobs:
|
||||
- name: Set zip name
|
||||
run: echo "zip_name=capa-${GITHUB_REF#refs/tags/}-${{ matrix.asset_name }}.zip" >> $GITHUB_ENV
|
||||
- name: Zip ${{ matrix.artifact_name }} into ${{ env.zip_name }}
|
||||
run: zip ${ZIP_NAME} ${{ matrix.artifact_name }}
|
||||
run: zip ${{ env.zip_name }} ${{ matrix.artifact_name }}
|
||||
- name: Upload ${{ env.zip_name }} to GH Release
|
||||
uses: svenstaro/upload-release-action@2728235f7dc9ff598bd86ce3c274b74f802d2208 # v2
|
||||
with:
|
||||
|
||||
4
.github/workflows/pip-audit.yml
vendored
4
.github/workflows/pip-audit.yml
vendored
@@ -14,8 +14,8 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- uses: pypa/gh-action-pip-audit@1220774d901786e6f652ae159f7b6bc8fea6d266 # v1.1.0
|
||||
- uses: pypa/gh-action-pip-audit@v1.0.8
|
||||
with:
|
||||
inputs: .
|
||||
|
||||
2
.github/workflows/publish.yml
vendored
2
.github/workflows/publish.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
with:
|
||||
path: dist/*
|
||||
- name: publish package
|
||||
uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # release/v1.12.4
|
||||
uses: pypa/gh-action-pypi-publish@f5622bde02b04381239da3573277701ceca8f6a0 # release/v1
|
||||
with:
|
||||
skip-existing: true
|
||||
verbose: true
|
||||
|
||||
4
.github/workflows/tag.yml
vendored
4
.github/workflows/tag.yml
vendored
@@ -21,10 +21,8 @@ jobs:
|
||||
# user information is needed to create annotated tags (with a message)
|
||||
git config user.email 'capa-dev@mandiant.com'
|
||||
git config user.name 'Capa Bot'
|
||||
name=${GITHUB_EVENT_RELEASE_TAG_NAME}
|
||||
name=${{ github.event.release.tag_name }}
|
||||
git tag $name -m "https://github.com/mandiant/capa/releases/$name"
|
||||
env:
|
||||
GITHUB_EVENT_RELEASE_TAG_NAME: ${{ github.event.release.tag_name }}
|
||||
# TODO update branch name-major=${name%%.*}
|
||||
- name: Push tag to capa-rules
|
||||
uses: ad-m/github-push-action@d91a481090679876dfc4178fef17f286781251df # v0.8.0
|
||||
|
||||
111
.github/workflows/tests.yml
vendored
111
.github/workflows/tests.yml
vendored
@@ -42,10 +42,10 @@ jobs:
|
||||
- name: Checkout capa
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
|
||||
# use latest available python to take advantage of best performance
|
||||
- name: Set up Python 3.13
|
||||
- name: Set up Python 3.12
|
||||
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
|
||||
with:
|
||||
python-version: "3.13"
|
||||
python-version: "3.12"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -r requirements.txt
|
||||
@@ -70,10 +70,10 @@ jobs:
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
|
||||
with:
|
||||
submodules: recursive
|
||||
- name: Set up Python 3.13
|
||||
- name: Set up Python 3.12
|
||||
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
|
||||
with:
|
||||
python-version: "3.13"
|
||||
python-version: "3.12"
|
||||
- name: Install capa
|
||||
run: |
|
||||
pip install -r requirements.txt
|
||||
@@ -88,14 +88,16 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2022, macos-15-intel, macos-14]
|
||||
os: [ubuntu-20.04, windows-2019, macos-13]
|
||||
# across all operating systems
|
||||
python-version: ["3.10", "3.13"]
|
||||
python-version: ["3.10", "3.11"]
|
||||
include:
|
||||
# on Ubuntu run these as well
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-20.04
|
||||
python-version: "3.10"
|
||||
- os: ubuntu-20.04
|
||||
python-version: "3.11"
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-20.04
|
||||
python-version: "3.12"
|
||||
steps:
|
||||
- name: Checkout capa with submodules
|
||||
@@ -107,17 +109,12 @@ jobs:
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install pyyaml
|
||||
if: matrix.os == 'ubuntu-22.04'
|
||||
if: matrix.os == 'ubuntu-20.04'
|
||||
run: sudo apt-get install -y libyaml-dev
|
||||
- name: Install capa
|
||||
run: |
|
||||
pip install -r requirements.txt
|
||||
pip install -e .[dev,scripts]
|
||||
- name: Cache vivisect workspaces
|
||||
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
|
||||
with:
|
||||
path: tests/data/**/*.viv
|
||||
key: viv-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ hashFiles('**/requirements.txt') }}
|
||||
- name: Run tests (fast)
|
||||
# this set of tests runs about 80% of the cases in 20% of the time,
|
||||
# and should catch most errors quickly.
|
||||
@@ -134,7 +131,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.10", "3.13"]
|
||||
python-version: ["3.10", "3.11"]
|
||||
steps:
|
||||
- name: Checkout capa with submodules
|
||||
# do only run if BN_SERIAL is available, have to do this in every step, see https://github.com/orgs/community/discussions/26726#discussioncomment-3253118
|
||||
@@ -160,7 +157,7 @@ jobs:
|
||||
run: |
|
||||
mkdir ./.github/binja
|
||||
curl "https://raw.githubusercontent.com/Vector35/binaryninja-api/6812c97/scripts/download_headless.py" -o ./.github/binja/download_headless.py
|
||||
python ./.github/binja/download_headless.py --serial ${BN_SERIAL} --output .github/binja/BinaryNinja-headless.zip
|
||||
python ./.github/binja/download_headless.py --serial ${{ env.BN_SERIAL }} --output .github/binja/BinaryNinja-headless.zip
|
||||
unzip .github/binja/BinaryNinja-headless.zip -d .github/binja/
|
||||
python .github/binja/binaryninja/scripts/install_api.py --install-on-root --silent
|
||||
- name: Run tests
|
||||
@@ -171,16 +168,16 @@ jobs:
|
||||
|
||||
ghidra-tests:
|
||||
name: Ghidra tests for ${{ matrix.python-version }}
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [tests]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.10", "3.13"]
|
||||
java-version: ["21"]
|
||||
ghidra-version: ["12.0"]
|
||||
public-version: ["PUBLIC_20251205"] # for ghidra releases
|
||||
ghidra-sha256: ['af43e8cfb2fa4490cf6020c3a2bde25c159d83f45236a0542688a024e8fc1941']
|
||||
python-version: ["3.10", "3.11"]
|
||||
java-version: ["17"]
|
||||
ghidra-version: ["11.0.1"]
|
||||
public-version: ["PUBLIC_20240130"] # for ghidra releases
|
||||
ghidrathon-version: ["4.0.0"]
|
||||
steps:
|
||||
- name: Checkout capa with submodules
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
|
||||
@@ -199,66 +196,26 @@ jobs:
|
||||
run: |
|
||||
mkdir ./.github/ghidra
|
||||
wget "https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_${{ matrix.ghidra-version }}_build/ghidra_${{ matrix.ghidra-version }}_${{ matrix.public-version }}.zip" -O ./.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip
|
||||
echo "${{ matrix.ghidra-sha256 }} ./.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip" | sha256sum -c -
|
||||
unzip .github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC.zip -d .github/ghidra/
|
||||
- name: Install Ghidrathon
|
||||
run : |
|
||||
mkdir ./.github/ghidrathon
|
||||
wget "https://github.com/mandiant/Ghidrathon/releases/download/v${{ matrix.ghidrathon-version }}/Ghidrathon-v${{ matrix.ghidrathon-version}}.zip" -O ./.github/ghidrathon/ghidrathon-v${{ matrix.ghidrathon-version }}.zip
|
||||
unzip .github/ghidrathon/ghidrathon-v${{ matrix.ghidrathon-version }}.zip -d .github/ghidrathon/
|
||||
python -m pip install -r .github/ghidrathon/requirements.txt
|
||||
python .github/ghidrathon/ghidrathon_configure.py $(pwd)/.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC
|
||||
unzip .github/ghidrathon/Ghidrathon-v${{ matrix.ghidrathon-version }}.zip -d .github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC/Ghidra/Extensions
|
||||
- name: Install pyyaml
|
||||
run: sudo apt-get install -y libyaml-dev
|
||||
- name: Install capa with Ghidra extra
|
||||
run: |
|
||||
pip install -e .[dev,ghidra]
|
||||
- name: Run tests
|
||||
env:
|
||||
GHIDRA_INSTALL_DIR: ${{ github.workspace }}/.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC
|
||||
run: pytest -v tests/test_ghidra_features.py
|
||||
|
||||
idalib-tests:
|
||||
name: IDA ${{ matrix.ida.version }} tests for ${{ matrix.python-version }}
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [tests]
|
||||
env:
|
||||
IDA_LICENSE_ID: ${{ secrets.IDA_LICENSE_ID }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.10", "3.13"]
|
||||
ida:
|
||||
- version: 9.0
|
||||
slug: "release/9.0/ida-essential/ida-essential_90_x64linux.run"
|
||||
- version: 9.1
|
||||
slug: "release/9.1/ida-essential/ida-essential_91_x64linux.run"
|
||||
- version: 9.2
|
||||
slug: "release/9.2/ida-essential/ida-essential_92_x64linux.run"
|
||||
steps:
|
||||
- name: Checkout capa with submodules
|
||||
# do only run if IDA_LICENSE_ID is available, have to do this in every step, see https://github.com/orgs/community/discussions/26726#discussioncomment-3253118
|
||||
if: ${{ env.IDA_LICENSE_ID != 0 }}
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
|
||||
with:
|
||||
submodules: recursive
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
if: ${{ env.IDA_LICENSE_ID != 0 }}
|
||||
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Setup uv
|
||||
if: ${{ env.IDA_LICENSE_ID != 0 }}
|
||||
uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
|
||||
- name: Install dependencies
|
||||
if: ${{ env.IDA_LICENSE_ID != 0 }}
|
||||
run: sudo apt-get install -y libyaml-dev
|
||||
- name: Install capa
|
||||
if: ${{ env.IDA_LICENSE_ID != 0 }}
|
||||
run: |
|
||||
pip install -r requirements.txt
|
||||
pip install -e .[dev,scripts]
|
||||
pip install idapro
|
||||
- name: Install IDA ${{ matrix.ida.version }}
|
||||
if: ${{ env.IDA_LICENSE_ID != 0 }}
|
||||
run: |
|
||||
uv run hcli --disable-updates ida install --download-id ${{ matrix.ida.slug }} --license-id ${{ secrets.IDA_LICENSE_ID }} --set-default --yes
|
||||
env:
|
||||
HCLI_API_KEY: ${{ secrets.HCLI_API_KEY }}
|
||||
IDA_LICENSE_ID: ${{ secrets.IDA_LICENSE_ID }}
|
||||
- name: Run tests
|
||||
if: ${{ env.IDA_LICENSE_ID != 0 }}
|
||||
run: pytest -v tests/test_idalib_features.py # explicitly refer to the idalib tests for performance. other tests run above.
|
||||
run: |
|
||||
mkdir ./.github/ghidra/project
|
||||
.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC/support/analyzeHeadless .github/ghidra/project ghidra_test -Import ./tests/data/mimikatz.exe_ -ScriptPath ./tests/ -PostScript test_ghidra_features.py > ../output.log
|
||||
cat ../output.log
|
||||
exit_code=$(cat ../output.log | grep exit | awk '{print $NF}')
|
||||
exit $exit_code
|
||||
|
||||
|
||||
22
.github/workflows/web-release.yml
vendored
22
.github/workflows/web-release.yml
vendored
@@ -18,18 +18,14 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set release name
|
||||
run: echo "RELEASE_NAME=capa-explorer-web-v${GITHUB_EVENT_INPUTS_VERSION}-${GITHUB_SHA::7}" >> $GITHUB_ENV
|
||||
env:
|
||||
GITHUB_EVENT_INPUTS_VERSION: ${{ github.event.inputs.version }}
|
||||
run: echo "RELEASE_NAME=capa-explorer-web-v${{ github.event.inputs.version }}-${GITHUB_SHA::7}" >> $GITHUB_ENV
|
||||
|
||||
- name: Check if release already exists
|
||||
run: |
|
||||
if ls web/explorer/releases/capa-explorer-web-v${GITHUB_EVENT_INPUTS_VERSION}-* 1> /dev/null 2>&1; then
|
||||
echo "::error:: A release with version ${GITHUB_EVENT_INPUTS_VERSION} already exists"
|
||||
if ls web/explorer/releases/capa-explorer-web-v${{ github.event.inputs.version }}-* 1> /dev/null 2>&1; then
|
||||
echo "::error:: A release with version ${{ github.event.inputs.version }} already exists"
|
||||
exit 1
|
||||
fi
|
||||
env:
|
||||
GITHUB_EVENT_INPUTS_VERSION: ${{ github.event.inputs.version }}
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@0a44ba7841725637a19e28fa30b79a866c81b0a6 # v4.0.4
|
||||
@@ -47,24 +43,24 @@ jobs:
|
||||
working-directory: web/explorer
|
||||
|
||||
- name: Compress bundle
|
||||
run: zip -r ${RELEASE_NAME}.zip capa-explorer-web
|
||||
run: zip -r ${{ env.RELEASE_NAME }}.zip capa-explorer-web
|
||||
working-directory: web/explorer
|
||||
|
||||
- name: Create releases directory
|
||||
run: mkdir -vp web/explorer/releases
|
||||
|
||||
- name: Move release to releases folder
|
||||
run: mv web/explorer/${RELEASE_NAME}.zip web/explorer/releases
|
||||
run: mv web/explorer/${{ env.RELEASE_NAME }}.zip web/explorer/releases
|
||||
|
||||
- name: Compute release SHA256 hash
|
||||
run: |
|
||||
echo "RELEASE_SHA256=$(sha256sum web/explorer/releases/${RELEASE_NAME}.zip | awk '{print $1}')" >> $GITHUB_ENV
|
||||
echo "RELEASE_SHA256=$(sha256sum web/explorer/releases/${{ env.RELEASE_NAME }}.zip | awk '{print $1}')" >> $GITHUB_ENV
|
||||
|
||||
- name: Update CHANGELOG.md
|
||||
run: |
|
||||
echo "## ${RELEASE_NAME}" >> web/explorer/releases/CHANGELOG.md
|
||||
echo "## ${{ env.RELEASE_NAME }}" >> web/explorer/releases/CHANGELOG.md
|
||||
echo "- Release Date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> web/explorer/releases/CHANGELOG.md
|
||||
echo "- SHA256: ${RELEASE_SHA256}" >> web/explorer/releases/CHANGELOG.md
|
||||
echo "- SHA256: ${{ env.RELEASE_SHA256 }}" >> web/explorer/releases/CHANGELOG.md
|
||||
echo "" >> web/explorer/releases/CHANGELOG.md
|
||||
cat web/explorer/releases/CHANGELOG.md
|
||||
|
||||
@@ -77,7 +73,7 @@ jobs:
|
||||
run: |
|
||||
git config --local user.email "capa-dev@mandiant.com"
|
||||
git config --local user.name "Capa Bot"
|
||||
git add -f web/explorer/releases/${RELEASE_NAME}.zip web/explorer/releases/CHANGELOG.md
|
||||
git add -f web/explorer/releases/${{ env.RELEASE_NAME }}.zip web/explorer/releases/CHANGELOG.md
|
||||
git add -u web/explorer/releases/
|
||||
|
||||
- name: Create Pull Request
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -122,7 +122,6 @@ scripts/perf/*.zip
|
||||
*/.DS_Store
|
||||
Pipfile
|
||||
Pipfile.lock
|
||||
uv.lock
|
||||
/cache/
|
||||
.github/binja/binaryninja
|
||||
.github/binja/download_headless.py
|
||||
|
||||
@@ -136,8 +136,8 @@ repos:
|
||||
- "tests/"
|
||||
- "--ignore=tests/test_binja_features.py"
|
||||
- "--ignore=tests/test_ghidra_features.py"
|
||||
- "--ignore=tests/test_ida_features.py"
|
||||
- "--ignore=tests/test_viv_features.py"
|
||||
- "--ignore=tests/test_idalib_features.py"
|
||||
- "--ignore=tests/test_main.py"
|
||||
- "--ignore=tests/test_scripts.py"
|
||||
always_run: true
|
||||
|
||||
227
CHANGELOG.md
227
CHANGELOG.md
@@ -4,236 +4,29 @@
|
||||
|
||||
### New Features
|
||||
|
||||
- ghidra: support PyGhidra @mike-hunhoff #2788
|
||||
- vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
### New Rules (23)
|
||||
|
||||
- nursery/run-as-nodejs-native-module mehunhoff@google.com
|
||||
- nursery/inject-shellcode-using-thread-pool-work-insertion-with-tp_io still@teamt5.org
|
||||
- nursery/inject-shellcode-using-thread-pool-work-insertion-with-tp_timer still@teamt5.org
|
||||
- nursery/inject-shellcode-using-thread-pool-work-insertion-with-tp_work still@teamt5.org
|
||||
- data-manipulation/encryption/hc-256/encrypt-data-using-hc-256 wballenthin@hex-rays.com
|
||||
- anti-analysis/anti-llm/terminate-anthropic-session-via-magic-strings wballenthin@hex-rays.com
|
||||
- nursery/access-aws-credentials maximemorin@google.com
|
||||
- nursery/access-cloudflare-credentials maximemorin@google.com
|
||||
- nursery/access-docker-credentials maximemorin@google.com
|
||||
- nursery/access-gcp-credentials maximemorin@google.com
|
||||
- nursery/access-kubernetes-credentials maximemorin@google.com
|
||||
- nursery/enumerate-aws-cloudformation maximemorin@google.com
|
||||
- nursery/enumerate-aws-cloudtrail maximemorin@google.com
|
||||
- nursery/enumerate-aws-direct-connect maximemorin@google.com
|
||||
- nursery/enumerate-aws-ec2 maximemorin@google.com
|
||||
- nursery/enumerate-aws-iam maximemorin@google.com
|
||||
- nursery/enumerate-aws-s3 maximemorin@google.com
|
||||
- nursery/enumerate-aws-support-cases maximemorin@google.com
|
||||
- persistence/registry/persist-via-shellserviceobjectdelayload-registry-key xpzhxhm@gmail.com
|
||||
- nursery/get-http-response-date @cosmoworker
|
||||
- host-interaction/process/create/create-process-in-dotnet moritz.raabe@mandiant.com social.tarang@gmail.com
|
||||
- nursery/read-file-in-dotnet moritz.raabe@mandiant.com anushka.virgaonkar@mandiant.com
|
||||
- nursery/write-file-in-dotnet william.ballenthin@mandiant.com anushka.virgaonkar@mandiant.com
|
||||
-
|
||||
|
||||
### Bug Fixes
|
||||
- main: suggest --os flag in unsupported OS error message to help users override ELF OS detection @devs6186 #2577
|
||||
- render: escape sample-controlled strings before passing to Rich to prevent MarkupError @devs6186 #2699
|
||||
- rules: handle empty or invalid YAML documents gracefully in `Rule.from_yaml` and `get_rules` @devs6186 #2900
|
||||
- Fixed insecure deserialization vulnerability in YAML loading @0x1622 (#2770)
|
||||
- loader: gracefully handle ELF files with unsupported architectures kamranulhaq2002@gmail.com #2800
|
||||
- loader: handle SegmentationViolation for malformed ELF files @kami922 #2799
|
||||
- lint: disable rule caching during linting @Maijin #2817
|
||||
- vmray: skip processes with invalid PID or missing filename @EclipseAditya #2807
|
||||
- features: fix Regex.get_value_str() returning escaped pattern instead of raw regex @EclipseAditya #1909
|
||||
- render: use default styling for dynamic -vv API/call details so they are easier to see @devs6186 #1865
|
||||
- loader: handle struct.error from dnfile and show clear CorruptFile message @devs6186 #2442
|
||||
- address: fix TypeError when sorting locations containing mixed address types @devs6186 #2195
|
||||
- loader: skip PE files with unrealistically large section virtual sizes to prevent resource exhaustion @devs6186 #1989
|
||||
|
||||
### capa Explorer Web
|
||||
- webui: fix 404 for "View rule in capa-rules" by using encodeURIComponent for rule name in URL @devs6186 #2482
|
||||
- webui: show error when JSON does not follow expected result document schema; suggest reanalyzing for VT URLs @devs6186 #2363
|
||||
- webui: fix global search to match feature types (match, regex, api, …) @devs6186 #2349
|
||||
|
||||
### capa Explorer IDA Pro plugin
|
||||
|
||||
### Performance
|
||||
|
||||
- perf: eliminate O(n²) tuple growth and reduce per-match overhead @devs6186 #2890
|
||||
|
||||
### Development
|
||||
|
||||
- doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410
|
||||
- doc: fix typo in usage.md, add documentation links to README @devs6186 #2274
|
||||
- doc: add table comparing ways to consume capa output (CLI, IDA, Ghidra, dynamic sandbox, web) @devs6186 #2273
|
||||
- binja: add mypy config for top-level binaryninja module to fix mypy issues @devs6186 #2399
|
||||
- ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777
|
||||
- ci: pin pip-audit action SHAs and update to v1.1.0 @kami922 #1131
|
||||
|
||||
### Raw diffs
|
||||
- [capa v9.3.1...master](https://github.com/mandiant/capa/compare/v9.3.1...master)
|
||||
- [capa-rules v9.3.1...master](https://github.com/mandiant/capa-rules/compare/v9.3.1...master)
|
||||
|
||||
## v9.3.1
|
||||
|
||||
This patch release fixes a missing import for the capa explorer plugin for IDA Pro.
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- add missing ida-netnode dependency to project.toml @mike-hunhoff #2765
|
||||
|
||||
### Development
|
||||
|
||||
- ci: bump binja min version @mike-hunhoff #2763
|
||||
|
||||
### Raw diffs
|
||||
- [capa v9.3.0...master](https://github.com/mandiant/capa/compare/v9.3.0...master)
|
||||
- [capa-rules v9.3.0...master](https://github.com/mandiant/capa-rules/compare/v9.3.0...master)
|
||||
|
||||
## v9.3.0
|
||||
|
||||
capa v9.3.0 comes with over 20 new and/or improved rules.
|
||||
For IDA users the capa explorer plugin is now available via the IDA Pro plugin repository and contains Qt compatibility layer for PyQt5 and PySide6 support.
|
||||
Additionally, a Binary Ninja bug has been fixed. Released binaries now include ARM64 binaries (Linux and macOS).
|
||||
|
||||
### New Features
|
||||
|
||||
- ci: add support for arm64 binary releases
|
||||
- tests: run tests against IDA via idalib @williballenthin #2742
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
### New Rules (24)
|
||||
|
||||
- anti-analysis/anti-vm/vm-detection/detect-mouse-movement-via-activity-checks-on-windows tevajdr@gmail.com
|
||||
- nursery/create-executable-heap moritz.raabe@mandiant.com
|
||||
- anti-analysis/packer/dxpack/packed-with-dxpack jakubjozwiak@google.com
|
||||
- anti-analysis/anti-av/patch-bitdefender-hooking-dll-function jakubjozwiak@google.com
|
||||
- nursery/acquire-load-driver-privileges mehunhoff@google.com
|
||||
- nursery/communicate-using-ftp mehunhoff@google.com
|
||||
- linking/static/eclipse-paho-mqtt-c/linked-against-eclipse-paho-mqtt-c jakubjozwiak@google.com
|
||||
- linking/static/qmqtt/linked-against-qmqtt jakubjozwiak@google.com
|
||||
- anti-analysis/anti-forensic/disable-powershell-transcription jakubjozwiak@google.com
|
||||
- host-interaction/powershell/bypass-powershell-constrained-language-mode-via-getsystemlockdownpolicy-patch jakubjozwiak@google.com
|
||||
- linking/static/grpc/linked-against-grpc jakubjozwiak@google.com
|
||||
- linking/static/hp-socket/linked-against-hp-socket jakubjozwiak@google.com
|
||||
- load-code/execute-jscript-via-vsaengine-in-dotnet jakubjozwiak@google.com
|
||||
- linking/static/funchook/linked-against-funchook jakubjozwiak@google.com
|
||||
- linking/static/plthook/linked-against-plthook jakubjozwiak@google.com
|
||||
- host-interaction/network/enumerate-tcp-connections-via-wmi-com-api jakubjozwiak@google.com
|
||||
- host-interaction/network/routing-table/create-routing-table-entry jakubjozwiak@google.com
|
||||
- host-interaction/network/routing-table/get-routing-table michael.hunhoff@mandiant.com
|
||||
- host-interaction/file-system/use-io_uring-io-interface-on-linux jakubjozwiak@google.com
|
||||
- collection/keylog/log-keystrokes-via-direct-input zeze-zeze
|
||||
- nursery/compiled-from-fsharp mehunhoff@google.com
|
||||
- nursery/decrypt-data-using-aes-via-dotnet mehunhoff@google.com
|
||||
- nursery/get-dotnet-assembly-entry-point mehunhoff@google.com
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- binja: fix a crash during feature extraction when the MLIL is unavailable @xusheng6 #2714
|
||||
|
||||
### capa Explorer Web
|
||||
|
||||
### capa Explorer IDA Pro plugin
|
||||
|
||||
- add `ida-plugin.json` for inclusion in the IDA Pro plugin repository @williballenthin
|
||||
- ida plugin: add Qt compatibility layer for PyQt5 and PySide6 support @williballenthin #2707
|
||||
- delay import to not load Qt* when running under idalib @mr-tz #2752
|
||||
|
||||
### Development
|
||||
|
||||
- ci: remove redundant "test_run" action from build workflow @mike-hunhoff #2692
|
||||
- dev: add bumpmyversion to bump and sync versions across the project @mr-tz
|
||||
|
||||
### Raw diffs
|
||||
- [capa v9.2.1...9.3.0](https://github.com/mandiant/capa/compare/v9.2.1...9.3.0)
|
||||
- [capa-rules v9.2.1...9.3.0](https://github.com/mandiant/capa-rules/compare/v9.2.1...9.3.0)
|
||||
|
||||
## v9.2.1
|
||||
|
||||
This point release fixes bugs including removing an unnecessary PyInstaller warning message and enabling the standalone binary to execute on systems running older versions of glibc.
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- ci: exclude pkg_resources from PyInstaller build @mike-hunhoff #2684
|
||||
- ci: downgrade Ubuntu version to accommodate older glibc versions @mike-hunhoff #2684
|
||||
|
||||
### Development
|
||||
|
||||
- ci: upgrade Windows version to avoid deprecation @mike-hunhoff #2684
|
||||
- ci: check if build runs without warnings or errors @mike-hunhoff #2684
|
||||
|
||||
### Raw diffs
|
||||
- [capa v9.2.0...v9.2.1](https://github.com/mandiant/capa/compare/v9.2.0...v9.2.1)
|
||||
- [capa-rules v9.2.0...v9.2.1](https://github.com/mandiant/capa-rules/compare/v9.2.0...v9.2.1)
|
||||
|
||||
## v9.2.0
|
||||
|
||||
This release improves a few aspects of dynamic analysis, including relaxing our validation on fields across many CAPE versions and processing additional VMRay submission file types, for example.
|
||||
It also includes an updated rule pack containing new rules and rule fixes.
|
||||
|
||||
### New Features
|
||||
- vmray: do not restrict analysis to PE and ELF files, e.g. docx @mike-hunhoff #2672
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
### New Rules (22)
|
||||
|
||||
- communication/socket/connect-socket moritz.raabe@mandiant.com joakim@intezer.com mrhafizfarhad@gmail.com
|
||||
- communication/socket/udp/connect-udp-socket mrhafizfarhad@gmail.com
|
||||
- nursery/enter-debug-mode-in-dotnet @v1bh475u
|
||||
- nursery/decrypt-data-using-tripledes-in-dotnet 0xRavenspar
|
||||
- nursery/encrypt-data-using-tripledes-in-dotnet 0xRavenspar
|
||||
- nursery/disable-system-features-via-registry-on-windows mehunhoff@google.com
|
||||
- data-manipulation/encryption/chaskey/encrypt-data-using-chaskey still@teamt5.org
|
||||
- data-manipulation/encryption/speck/encrypt-data-using-speck still@teamt5.org
|
||||
- load-code/dotnet/load-assembly-via-iassembly still@teamt5.org
|
||||
- malware-family/donut-loader/load-shellcode-via-donut still@teamt5.org
|
||||
- nursery/disable-device-guard-features-via-registry-on-windows mehunhoff@google.com
|
||||
- nursery/disable-firewall-features-via-registry-on-windows mehunhoff@google.com
|
||||
- nursery/disable-system-restore-features-via-registry-on-windows mehunhoff@google.com
|
||||
- nursery/disable-windows-defender-features-via-registry-on-windows mehunhoff@google.com
|
||||
- host-interaction/file-system/write/clear-file-content jakeperalta7
|
||||
- host-interaction/filter/unload-minifilter-driver JakePeralta7
|
||||
- exploitation/enumeration/make-suspicious-ntquerysysteminformation-call zdw@google.com
|
||||
- exploitation/gadgets/load-ntoskrnl zdw@google.com
|
||||
- exploitation/gadgets/resolve-ntoskrnl-gadgets zdw@google.com
|
||||
- exploitation/spraying/make-suspicious-ntfscontrolfile-call zdw@google.com
|
||||
- anti-analysis/anti-forensic/unload-sysmon JakePeralta7
|
||||
|
||||
### Bug Fixes
|
||||
- cape: make some fields optional @williballenthin #2631 #2632
|
||||
- lint: add WARN for regex features that contain unescaped dot #2635
|
||||
- lint: add ERROR for incomplete registry control set regex #2643
|
||||
- binja: update unit test core version #2670
|
||||
|
||||
### Raw diffs
|
||||
- [capa v9.1.0...v9.2.0](https://github.com/mandiant/capa/compare/v9.1.0...v9.2.0)
|
||||
- [capa-rules v9.1.0...v9.2.0](https://github.com/mandiant/capa-rules/compare/v9.1.0...v9.2.0)
|
||||
|
||||
## v9.1.0
|
||||
|
||||
This release improves a few aspects of dynamic analysis, relaxing our validation on fields across many CAPE versions, for example.
|
||||
It also includes an updated rule pack in which many dynamic rules make better use of the "span of calls" scope.
|
||||
|
||||
|
||||
### New Rules (3)
|
||||
### New Rules (4)
|
||||
|
||||
- host-interaction/registry/change-registry-key-timestamp wballenthin@google.com
|
||||
- host-interaction/mutex/check-mutex-and-terminate-process-on-windows @_re_fox moritz.raabe@mandiant.com mehunhoff@google.com
|
||||
- anti-analysis/anti-forensic/clear-logs/clear-windows-event-logs-remotely 99.elad.levi@gmail.com
|
||||
-
|
||||
|
||||
### Bug Fixes
|
||||
- only parse CAPE fields required for analysis @mike-hunhoff #2607
|
||||
- main: render result document without needing associated rules @williballenthin #2610
|
||||
- vmray: only verify process OS and monitor IDs match @mike-hunhoff #2613
|
||||
- render: don't assume prior matches exist within a thread @mike-hunhoff #2612
|
||||
|
||||
### capa Explorer Web
|
||||
|
||||
### capa Explorer IDA Pro plugin
|
||||
|
||||
### Development
|
||||
|
||||
### Raw diffs
|
||||
- [capa v9.0.0...v9.1.0](https://github.com/mandiant/capa/compare/v9.0.0...v9.1.0)
|
||||
- [capa-rules v9.0.0...v9.1.0](https://github.com/mandiant/capa-rules/compare/v9.0.0...v9.1.0)
|
||||
- [capa v9.0.0...master](https://github.com/mandiant/capa/compare/v9.0.0...master)
|
||||
- [capa-rules v9.0.0...master](https://github.com/mandiant/capa-rules/compare/v9.0.0...master)
|
||||
|
||||
## v9.0.0
|
||||
|
||||
|
||||
38
README.md
38
README.md
@@ -87,33 +87,6 @@ Download stable releases of the standalone capa binaries [here](https://github.c
|
||||
|
||||
To use capa as a library or integrate with another tool, see [doc/installation.md](https://github.com/mandiant/capa/blob/master/doc/installation.md) for further setup instructions.
|
||||
|
||||
**Documentation:** [Usage and tips](doc/usage.md) · [Installation](doc/installation.md) · [Limitations](doc/limitations.md) · [FAQ](doc/faq.md)
|
||||
|
||||
# mapa html map
|
||||
|
||||
mapa can render a standalone HTML overview of functions, string tags, and referenced strings.
|
||||
|
||||
```bash
|
||||
python -m mapa binaries/01/16/mpbindump.exe --output html-map > report.html
|
||||
```
|
||||
|
||||
On the `mpbindump.exe` sample, the generated report contains:
|
||||
|
||||
```text
|
||||
doctype <!doctype html>
|
||||
functions 1406
|
||||
tags 12
|
||||
strings 81
|
||||
```
|
||||
|
||||
Use `--open` to write the report to a temporary file and open it in your browser.
|
||||
|
||||
```bash
|
||||
python -m mapa binaries/01/16/mpbindump.exe --output html-map --open
|
||||
```
|
||||
|
||||
The executable proof for this example is in `doc/demos/mapa-html-map.md`.
|
||||
|
||||
# capa Explorer Web
|
||||
The [capa Explorer Web](https://mandiant.github.io/capa/explorer/) enables you to interactively explore capa results in your web browser. Besides the online version you can download a standalone HTML file for local offline usage.
|
||||
|
||||
@@ -318,17 +291,11 @@ It also uses your local changes to the .idb to extract better features, such as
|
||||

|
||||
|
||||
# Ghidra integration
|
||||
|
||||
capa supports using Ghidra (via [PyGhidra](https://github.com/NationalSecurityAgency/ghidra/tree/master/Ghidra/Features/PyGhidra)) as a feature extraction backend. This allows you to run capa against binaries using Ghidra's analysis engine.
|
||||
|
||||
You can run and view capa results in the Ghidra UI using [capa explorer for Ghidra](https://github.com/mandiant/capa/tree/master/capa/ghidra/plugin).
|
||||
If you use Ghidra, then you can use the [capa + Ghidra integration](/capa/ghidra/) to run capa's analysis directly on your Ghidra database and render the results in Ghidra's user interface.
|
||||
|
||||
<img src="https://github.com/mandiant/capa/assets/66766340/eeae33f4-99d4-42dc-a5e8-4c1b8c661492" width=300>
|
||||
|
||||
You can also run capa from the command line using the [Ghidra backend](https://github.com/mandiant/capa/tree/master/capa/ghidra).
|
||||
|
||||
# blog posts
|
||||
- [Riding Dragons: capa Harnesses Ghidra](https://www.mandiant.com/resources/blog/capa-harnesses-ghidra)
|
||||
- [Dynamic capa: Exploring Executable Run-Time Behavior with the CAPE Sandbox](https://www.mandiant.com/resources/blog/dynamic-capa-executable-behavior-cape-sandbox)
|
||||
- [capa v4: casting a wider .NET](https://www.mandiant.com/resources/blog/capa-v4-casting-wider-net) (.NET support)
|
||||
- [ELFant in the Room – capa v3](https://www.mandiant.com/resources/elfant-in-the-room-capa-v3) (ELF support)
|
||||
@@ -348,6 +315,3 @@ You can also run capa from the command line using the [Ghidra backend](https://g
|
||||
|
||||
## capa testfiles
|
||||
The [capa-testfiles repository](https://github.com/mandiant/capa-testfiles) contains the data we use to test capa's code and rules.
|
||||
|
||||
## mailing list
|
||||
Subscribe to the FLARE mailing list for community announcements! Email "subscribe" to [flare-external@google.com](mailto:flare-external@google.com?subject=subscribe).
|
||||
|
||||
@@ -277,9 +277,7 @@ def find_dynamic_capabilities(
|
||||
all_span_matches: MatchResults = collections.defaultdict(list)
|
||||
all_call_matches: MatchResults = collections.defaultdict(list)
|
||||
|
||||
# Accumulate into a list to avoid O(n²) tuple concatenation.
|
||||
# Tuples are immutable, so `t += (x,)` copies the entire tuple each time.
|
||||
process_feature_counts: list[rdoc.ProcessFeatureCount] = []
|
||||
feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())
|
||||
|
||||
assert isinstance(extractor, DynamicFeatureExtractor)
|
||||
processes: list[ProcessHandle] = list(extractor.get_processes())
|
||||
@@ -291,10 +289,10 @@ def find_dynamic_capabilities(
|
||||
task = pbar.add_task("matching", total=n_processes, unit="processes")
|
||||
for p in processes:
|
||||
process_capabilities = find_process_capabilities(ruleset, extractor, p)
|
||||
process_feature_counts.append(
|
||||
feature_counts.processes += (
|
||||
rdoc.ProcessFeatureCount(
|
||||
address=frz.Address.from_capa(p.address), count=process_capabilities.feature_count
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
for rule_name, res in process_capabilities.process_matches.items():
|
||||
@@ -319,11 +317,7 @@ def find_dynamic_capabilities(
|
||||
capa.engine.index_rule_matches(process_and_lower_features, rule, locations)
|
||||
|
||||
all_file_capabilities = find_file_capabilities(ruleset, extractor, process_and_lower_features)
|
||||
|
||||
feature_counts = rdoc.DynamicFeatureCounts(
|
||||
file=all_file_capabilities.feature_count,
|
||||
processes=tuple(process_feature_counts),
|
||||
)
|
||||
feature_counts.file = all_file_capabilities.feature_count
|
||||
|
||||
matches = dict(
|
||||
itertools.chain(
|
||||
|
||||
@@ -156,11 +156,8 @@ def find_static_capabilities(
|
||||
all_bb_matches: MatchResults = collections.defaultdict(list)
|
||||
all_insn_matches: MatchResults = collections.defaultdict(list)
|
||||
|
||||
# Accumulate into lists to avoid O(n²) tuple concatenation.
|
||||
# Tuples are immutable, so `t += (x,)` copies the entire tuple each time.
|
||||
# For binaries with thousands of functions this becomes quadratic in memory work.
|
||||
function_feature_counts: list[rdoc.FunctionFeatureCount] = []
|
||||
library_functions_list: list[rdoc.LibraryFunction] = []
|
||||
feature_counts = rdoc.StaticFeatureCounts(file=0, functions=())
|
||||
library_functions: tuple[rdoc.LibraryFunction, ...] = ()
|
||||
|
||||
assert isinstance(extractor, StaticFeatureExtractor)
|
||||
functions: list[FunctionHandle] = list(extractor.get_functions())
|
||||
@@ -179,20 +176,20 @@ def find_static_capabilities(
|
||||
if extractor.is_library_function(f.address):
|
||||
function_name = extractor.get_function_name(f.address)
|
||||
logger.debug("skipping library function 0x%x (%s)", f.address, function_name)
|
||||
library_functions_list.append(
|
||||
rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name)
|
||||
library_functions += (
|
||||
rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name),
|
||||
)
|
||||
n_libs = len(library_functions_list)
|
||||
n_libs = len(library_functions)
|
||||
percentage = round(100 * (n_libs / n_funcs))
|
||||
pbar.update(task, postfix=f"skipped {n_libs} library functions, {percentage}%")
|
||||
pbar.advance(task)
|
||||
continue
|
||||
|
||||
code_capabilities = find_code_capabilities(ruleset, extractor, f)
|
||||
function_feature_counts.append(
|
||||
feature_counts.functions += (
|
||||
rdoc.FunctionFeatureCount(
|
||||
address=frz.Address.from_capa(f.address), count=code_capabilities.feature_count
|
||||
)
|
||||
),
|
||||
)
|
||||
t1 = time.time()
|
||||
|
||||
@@ -233,11 +230,7 @@ def find_static_capabilities(
|
||||
capa.engine.index_rule_matches(function_and_lower_features, rule, locations)
|
||||
|
||||
all_file_capabilities = find_file_capabilities(ruleset, extractor, function_and_lower_features)
|
||||
|
||||
feature_counts = rdoc.StaticFeatureCounts(
|
||||
file=all_file_capabilities.feature_count,
|
||||
functions=tuple(function_feature_counts),
|
||||
)
|
||||
feature_counts.file = all_file_capabilities.feature_count
|
||||
|
||||
matches: MatchResults = dict(
|
||||
itertools.chain(
|
||||
@@ -251,4 +244,4 @@ def find_static_capabilities(
|
||||
)
|
||||
)
|
||||
|
||||
return Capabilities(matches, feature_counts, tuple(library_functions_list))
|
||||
return Capabilities(matches, feature_counts, library_functions)
|
||||
|
||||
@@ -189,11 +189,6 @@ class _NoAddress(Address):
|
||||
def __lt__(self, other):
|
||||
return False
|
||||
|
||||
def __gt__(self, other):
|
||||
# Mixed-type comparison: (real_address < NO_ADDRESS) invokes this so sort works.
|
||||
# NoAddress sorts last.
|
||||
return other is not self
|
||||
|
||||
def __hash__(self):
|
||||
return hash(0)
|
||||
|
||||
|
||||
@@ -369,12 +369,6 @@ class Regex(String):
|
||||
else:
|
||||
return Result(False, _MatchedRegex(self, {}), [])
|
||||
|
||||
def get_value_str(self) -> str:
|
||||
# return the raw regex pattern, not the escaped version from String.get_value_str().
|
||||
# see #1909.
|
||||
assert isinstance(self.value, str)
|
||||
return self.value
|
||||
|
||||
def __str__(self):
|
||||
assert isinstance(self.value, str)
|
||||
return f"regex(string =~ {self.value})"
|
||||
|
||||
@@ -20,7 +20,6 @@ Proto files generated via protobuf v24.4:
|
||||
from BinExport2 at 6916731d5f6693c4a4f0a052501fd3bd92cfd08b
|
||||
https://github.com/google/binexport/blob/6916731/binexport2.proto
|
||||
"""
|
||||
|
||||
import io
|
||||
import hashlib
|
||||
import logging
|
||||
@@ -41,12 +40,16 @@ from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_binexport2(sample: Path) -> BinExport2:
|
||||
def get_binexport2_from_bytes(buf: bytes) -> BinExport2:
|
||||
be2: BinExport2 = BinExport2()
|
||||
be2.ParseFromString(sample.read_bytes())
|
||||
be2.ParseFromString(buf)
|
||||
return be2
|
||||
|
||||
|
||||
def get_binexport2(sample: Path) -> BinExport2:
|
||||
return get_binexport2_from_bytes(sample.read_bytes())
|
||||
|
||||
|
||||
def compute_common_prefix_length(m: str, n: str) -> int:
|
||||
# ensure #m < #n
|
||||
if len(n) < len(m):
|
||||
|
||||
@@ -84,14 +84,16 @@ def extract_insn_number_features(
|
||||
yield OperandOffset(i, value), ih.address
|
||||
|
||||
|
||||
OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int] ; capture #int
|
||||
ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int]! ; capture #int
|
||||
ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack)], #int ; capture #int
|
||||
ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int] ; capture #int
|
||||
ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int]! ; capture #int
|
||||
ldp|ldpd|stp|stpd reg, reg, [reg(not-stack)], #int ; capture #int
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def extract_insn_offset_features(
|
||||
@@ -115,10 +117,12 @@ def extract_insn_offset_features(
|
||||
yield OperandOffset(match.operand_index, value), ih.address
|
||||
|
||||
|
||||
NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
eor reg, reg, reg
|
||||
eor reg, reg, #int
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def extract_insn_nzxor_characteristic_features(
|
||||
@@ -140,9 +144,11 @@ def extract_insn_nzxor_characteristic_features(
|
||||
yield Characteristic("nzxor"), ih.address
|
||||
|
||||
|
||||
INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
blx|bx|blr reg
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def extract_function_indirect_call_characteristic_features(
|
||||
|
||||
@@ -34,14 +34,17 @@ from capa.features.extractors.binexport2.arch.intel.helpers import SECURITY_COOK
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
IGNORE_NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
IGNORE_NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
ret #int
|
||||
retn #int
|
||||
add reg(stack), #int
|
||||
sub reg(stack), #int
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
push #int0 ; capture #int0
|
||||
|
||||
# its a little tedious to enumerate all the address forms
|
||||
@@ -61,7 +64,8 @@ NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
# imagine reg is zero'd out, then this is like `mov reg, #int`
|
||||
# which is not uncommon.
|
||||
lea reg, [reg + #int] ; capture #int
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def extract_insn_number_features(
|
||||
@@ -96,7 +100,8 @@ def extract_insn_number_features(
|
||||
yield OperandOffset(match.operand_index, value), ih.address
|
||||
|
||||
|
||||
OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
mov|movzx|movsb|cmp [reg + reg * #int + #int0], #int ; capture #int0
|
||||
mov|movzx|movsb|cmp [reg * #int + #int0], #int ; capture #int0
|
||||
mov|movzx|movsb|cmp [reg + reg + #int0], #int ; capture #int0
|
||||
@@ -109,15 +114,18 @@ OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
mov|movzx|movsb|cmp|lea reg, [reg * #int + #int0] ; capture #int0
|
||||
mov|movzx|movsb|cmp|lea reg, [reg + reg + #int0] ; capture #int0
|
||||
mov|movzx|movsb|cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
# these are patterns that access offset 0 from some pointer
|
||||
# (pointer is not the stack pointer).
|
||||
OFFSET_ZERO_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
OFFSET_ZERO_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
mov|movzx|movsb [reg(not-stack)], reg
|
||||
mov|movzx|movsb [reg(not-stack)], #int
|
||||
lea reg, [reg(not-stack)]
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def extract_insn_offset_features(
|
||||
@@ -181,10 +189,12 @@ def is_security_cookie(
|
||||
return False
|
||||
|
||||
|
||||
NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
xor|xorpd|xorps|pxor reg, reg
|
||||
xor|xorpd|xorps|pxor reg, #int
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def extract_insn_nzxor_characteristic_features(
|
||||
@@ -218,7 +228,8 @@ def extract_insn_nzxor_characteristic_features(
|
||||
yield Characteristic("nzxor"), ih.address
|
||||
|
||||
|
||||
INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str(
|
||||
"""
|
||||
call|jmp reg0
|
||||
call|jmp [reg + reg * #int + #int]
|
||||
call|jmp [reg + reg * #int]
|
||||
@@ -226,7 +237,8 @@ INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str("""
|
||||
call|jmp [reg + reg + #int]
|
||||
call|jmp [reg + #int]
|
||||
call|jmp [reg]
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def extract_function_indirect_call_characteristic_features(
|
||||
|
||||
@@ -64,12 +64,17 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle
|
||||
):
|
||||
continue
|
||||
|
||||
dll = ""
|
||||
if vertex.HasField("library_index"):
|
||||
library = be2.library[vertex.library_index]
|
||||
dll = library.name
|
||||
|
||||
if not vertex.HasField("mangled_name"):
|
||||
logger.debug("vertex %d does not have mangled_name", vertex_idx)
|
||||
continue
|
||||
|
||||
api_name: str = vertex.mangled_name
|
||||
for name in capa.features.extractors.helpers.generate_symbols("", api_name):
|
||||
for name in capa.features.extractors.helpers.generate_symbols(dll, api_name):
|
||||
yield API(name), ih.address
|
||||
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ from binaryninja import (
|
||||
Function,
|
||||
BinaryView,
|
||||
SymbolType,
|
||||
ILException,
|
||||
RegisterValueType,
|
||||
VariableSourceType,
|
||||
LowLevelILOperation,
|
||||
@@ -191,8 +192,9 @@ def extract_stackstring(fh: FunctionHandle):
|
||||
if bv is None:
|
||||
return
|
||||
|
||||
mlil = func.mlil
|
||||
if mlil is None:
|
||||
try:
|
||||
mlil = func.mlil
|
||||
except ILException:
|
||||
return
|
||||
|
||||
for block in mlil.basic_blocks:
|
||||
|
||||
@@ -35,7 +35,7 @@ from capa.features.extractors.base_extractor import (
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
TESTED_VERSIONS = {"2.2-CAPE", "2.4-CAPE", "2.5-CAPE"}
|
||||
TESTED_VERSIONS = {"2.2-CAPE", "2.4-CAPE"}
|
||||
|
||||
|
||||
class CapeExtractor(DynamicFeatureExtractor):
|
||||
@@ -54,8 +54,7 @@ class CapeExtractor(DynamicFeatureExtractor):
|
||||
|
||||
def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]:
|
||||
# value according to the PE header, the actual trace may use a different imagebase
|
||||
assert self.report.static is not None
|
||||
assert self.report.static.pe is not None
|
||||
assert self.report.static is not None and self.report.static.pe is not None
|
||||
return AbsoluteVirtualAddress(self.report.static.pe.imagebase)
|
||||
|
||||
def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
|
||||
|
||||
@@ -88,49 +88,31 @@ def extract_file_strings(report: CapeReport) -> Iterator[tuple[Feature, Address]
|
||||
|
||||
|
||||
def extract_used_regkeys(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
|
||||
if not report.behavior.summary:
|
||||
return
|
||||
|
||||
for regkey in report.behavior.summary.keys:
|
||||
yield String(regkey), NO_ADDRESS
|
||||
|
||||
|
||||
def extract_used_files(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
|
||||
if not report.behavior.summary:
|
||||
return
|
||||
|
||||
for file in report.behavior.summary.files:
|
||||
yield String(file), NO_ADDRESS
|
||||
|
||||
|
||||
def extract_used_mutexes(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
|
||||
if not report.behavior.summary:
|
||||
return
|
||||
|
||||
for mutex in report.behavior.summary.mutexes:
|
||||
yield String(mutex), NO_ADDRESS
|
||||
|
||||
|
||||
def extract_used_commands(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
|
||||
if not report.behavior.summary:
|
||||
return
|
||||
|
||||
for cmd in report.behavior.summary.executed_commands:
|
||||
yield String(cmd), NO_ADDRESS
|
||||
|
||||
|
||||
def extract_used_apis(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
|
||||
if not report.behavior.summary:
|
||||
return
|
||||
|
||||
for symbol in report.behavior.summary.resolved_apis:
|
||||
yield String(symbol), NO_ADDRESS
|
||||
|
||||
|
||||
def extract_used_services(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
|
||||
if not report.behavior.summary:
|
||||
return
|
||||
|
||||
for svc in report.behavior.summary.created_services:
|
||||
yield String(svc), NO_ADDRESS
|
||||
for svc in report.behavior.summary.started_services:
|
||||
|
||||
@@ -188,15 +188,15 @@ class PE(FlexibleModel):
|
||||
# timestamp: str
|
||||
|
||||
# list[ImportedDll], or dict[basename(dll), ImportedDll]
|
||||
imports: list[ImportedDll] | dict[str, ImportedDll] = Field(default_factory=list) # type: ignore
|
||||
imports: Union[list[ImportedDll], dict[str, ImportedDll]]
|
||||
# imported_dll_count: Optional[int] = None
|
||||
# imphash: str
|
||||
|
||||
# exported_dll_name: Optional[str] = None
|
||||
exports: list[ExportedSymbol] = Field(default_factory=list)
|
||||
exports: list[ExportedSymbol]
|
||||
|
||||
# dirents: list[DirectoryEntry]
|
||||
sections: list[Section] = Field(default_factory=list)
|
||||
sections: list[Section]
|
||||
|
||||
# ep_bytes: Optional[HexBytes] = None
|
||||
|
||||
@@ -364,7 +364,7 @@ class EncryptedBuffer(FlexibleModel):
|
||||
|
||||
|
||||
class Behavior(FlexibleModel):
|
||||
summary: Summary | None = None
|
||||
summary: Summary
|
||||
|
||||
# list of processes, of threads, of calls
|
||||
processes: list[Process]
|
||||
|
||||
@@ -27,12 +27,7 @@ import capa.features.extractors.dnfile.file
|
||||
import capa.features.extractors.dnfile.insn
|
||||
import capa.features.extractors.dnfile.function
|
||||
from capa.features.common import Feature
|
||||
from capa.features.address import (
|
||||
NO_ADDRESS,
|
||||
Address,
|
||||
DNTokenAddress,
|
||||
DNTokenOffsetAddress,
|
||||
)
|
||||
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress
|
||||
from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod
|
||||
from capa.features.extractors.base_extractor import (
|
||||
BBHandle,
|
||||
@@ -44,7 +39,6 @@ from capa.features.extractors.base_extractor import (
|
||||
from capa.features.extractors.dnfile.helpers import (
|
||||
get_dotnet_types,
|
||||
get_dotnet_fields,
|
||||
load_dotnet_image,
|
||||
get_dotnet_managed_imports,
|
||||
get_dotnet_managed_methods,
|
||||
get_dotnet_unmanaged_imports,
|
||||
@@ -89,7 +83,7 @@ class DnFileFeatureExtractorCache:
|
||||
|
||||
class DnfileFeatureExtractor(StaticFeatureExtractor):
|
||||
def __init__(self, path: Path):
|
||||
self.pe = load_dotnet_image(path)
|
||||
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
|
||||
super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
|
||||
|
||||
# pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction
|
||||
@@ -118,12 +112,7 @@ class DnfileFeatureExtractor(StaticFeatureExtractor):
|
||||
fh: FunctionHandle = FunctionHandle(
|
||||
address=DNTokenAddress(token),
|
||||
inner=method,
|
||||
ctx={
|
||||
"pe": self.pe,
|
||||
"calls_from": set(),
|
||||
"calls_to": set(),
|
||||
"cache": self.token_cache,
|
||||
},
|
||||
ctx={"pe": self.pe, "calls_from": set(), "calls_to": set(), "cache": self.token_cache},
|
||||
)
|
||||
|
||||
# method tokens should be unique
|
||||
|
||||
@@ -15,10 +15,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import struct
|
||||
import logging
|
||||
from typing import Union, Iterator, Optional
|
||||
from pathlib import Path
|
||||
|
||||
import dnfile
|
||||
from dncil.cil.body import CilMethodBody
|
||||
@@ -32,16 +30,6 @@ from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_dotnet_image(path: Path) -> dnfile.dnPE:
|
||||
"""load a .NET PE file, raising CorruptFile on struct.error with the original error message."""
|
||||
try:
|
||||
return dnfile.dnPE(str(path))
|
||||
except struct.error as e:
|
||||
from capa.loader import CorruptFile
|
||||
|
||||
raise CorruptFile(f"Invalid or truncated .NET metadata: {e}") from e
|
||||
|
||||
|
||||
class DnfileMethodBodyReader(CilMethodBodyReaderBase):
|
||||
def __init__(self, pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow):
|
||||
self.pe: dnfile.dnPE = pe
|
||||
@@ -163,9 +151,7 @@ def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[DnType]:
|
||||
)
|
||||
|
||||
|
||||
def get_dotnet_methoddef_property_accessors(
|
||||
pe: dnfile.dnPE,
|
||||
) -> Iterator[tuple[int, str]]:
|
||||
def get_dotnet_methoddef_property_accessors(pe: dnfile.dnPE) -> Iterator[tuple[int, str]]:
|
||||
"""get MethodDef methods used to access properties
|
||||
|
||||
see https://www.ntcore.com/files/dotnetformat.htm
|
||||
@@ -240,13 +226,7 @@ def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnType]:
|
||||
|
||||
typedefnamespace, typedefname = resolve_nested_typedef_name(nested_class_table, rid, typedef, pe)
|
||||
|
||||
yield DnType(
|
||||
token,
|
||||
typedefname,
|
||||
namespace=typedefnamespace,
|
||||
member=method_name,
|
||||
access=access,
|
||||
)
|
||||
yield DnType(token, typedefname, namespace=typedefnamespace, member=method_name, access=access)
|
||||
|
||||
|
||||
def get_dotnet_fields(pe: dnfile.dnPE) -> Iterator[DnType]:
|
||||
@@ -279,9 +259,7 @@ def get_dotnet_fields(pe: dnfile.dnPE) -> Iterator[DnType]:
|
||||
yield DnType(token, typedefname, namespace=typedefnamespace, member=field.row.Name)
|
||||
|
||||
|
||||
def get_dotnet_managed_method_bodies(
|
||||
pe: dnfile.dnPE,
|
||||
) -> Iterator[tuple[int, CilMethodBody]]:
|
||||
def get_dotnet_managed_method_bodies(pe: dnfile.dnPE) -> Iterator[tuple[int, CilMethodBody]]:
|
||||
"""get managed methods from MethodDef table"""
|
||||
for rid, method_def in iter_dotnet_table(pe, dnfile.mdtable.MethodDef.number):
|
||||
assert isinstance(method_def, dnfile.mdtable.MethodDefRow)
|
||||
@@ -360,10 +338,7 @@ def get_dotnet_table_row(pe: dnfile.dnPE, table_index: int, row_index: int) -> O
|
||||
|
||||
|
||||
def resolve_nested_typedef_name(
|
||||
nested_class_table: dict,
|
||||
index: int,
|
||||
typedef: dnfile.mdtable.TypeDefRow,
|
||||
pe: dnfile.dnPE,
|
||||
nested_class_table: dict, index: int, typedef: dnfile.mdtable.TypeDefRow, pe: dnfile.dnPE
|
||||
) -> tuple[str, tuple[str, ...]]:
|
||||
"""Resolves all nested TypeDef class names. Returns the namespace as a str and the nested TypeRef name as a tuple"""
|
||||
|
||||
|
||||
@@ -42,7 +42,6 @@ from capa.features.extractors.dnfile.types import DnType
|
||||
from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor
|
||||
from capa.features.extractors.dnfile.helpers import (
|
||||
iter_dotnet_table,
|
||||
load_dotnet_image,
|
||||
is_dotnet_mixed_mode,
|
||||
get_dotnet_managed_imports,
|
||||
get_dotnet_managed_methods,
|
||||
@@ -185,8 +184,8 @@ GLOBAL_HANDLERS = (
|
||||
class DotnetFileFeatureExtractor(StaticFeatureExtractor):
|
||||
def __init__(self, path: Path):
|
||||
super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
|
||||
self.path = path
|
||||
self.pe = load_dotnet_image(path)
|
||||
self.path: Path = path
|
||||
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
|
||||
|
||||
def get_base_address(self):
|
||||
return NO_ADDRESS
|
||||
@@ -218,10 +217,7 @@ class DotnetFileFeatureExtractor(StaticFeatureExtractor):
|
||||
assert self.pe.net.struct.MajorRuntimeVersion is not None
|
||||
assert self.pe.net.struct.MinorRuntimeVersion is not None
|
||||
|
||||
return (
|
||||
self.pe.net.struct.MajorRuntimeVersion,
|
||||
self.pe.net.struct.MinorRuntimeVersion,
|
||||
)
|
||||
return self.pe.net.struct.MajorRuntimeVersion, self.pe.net.struct.MinorRuntimeVersion
|
||||
|
||||
def get_meta_version_string(self) -> str:
|
||||
assert self.pe.net is not None
|
||||
|
||||
@@ -83,7 +83,7 @@ def bb_contains_stackstring(bb: ghidra.program.model.block.CodeBlock) -> bool:
|
||||
true if basic block contains enough moves of constant bytes to the stack
|
||||
"""
|
||||
count = 0
|
||||
for insn in capa.features.extractors.ghidra.helpers.get_current_program().getListing().getInstructions(bb, True):
|
||||
for insn in currentProgram().getListing().getInstructions(bb, True): # type: ignore [name-defined] # noqa: F821
|
||||
if is_mov_imm_to_stack(insn):
|
||||
count += get_printable_len(insn.getScalar(1))
|
||||
if count > MIN_STACKSTRING_LEN:
|
||||
@@ -96,9 +96,7 @@ def _bb_has_tight_loop(bb: ghidra.program.model.block.CodeBlock):
|
||||
parse tight loops, true if last instruction in basic block branches to bb start
|
||||
"""
|
||||
# Reverse Ordered, first InstructionDB
|
||||
last_insn = (
|
||||
capa.features.extractors.ghidra.helpers.get_current_program().getListing().getInstructions(bb, False).next()
|
||||
)
|
||||
last_insn = currentProgram().getListing().getInstructions(bb, False).next() # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if last_insn.getFlowType().isJump():
|
||||
return last_insn.getAddress(0) == bb.getMinAddress()
|
||||
@@ -142,3 +140,20 @@ def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Featur
|
||||
for bb_handler in BASIC_BLOCK_HANDLERS:
|
||||
for feature, addr in bb_handler(fh, bbh):
|
||||
yield feature, addr
|
||||
|
||||
|
||||
def main():
|
||||
features = []
|
||||
from capa.features.extractors.ghidra.extractor import GhidraFeatureExtractor
|
||||
|
||||
for fh in GhidraFeatureExtractor().get_functions():
|
||||
for bbh in capa.features.extractors.ghidra.helpers.get_function_blocks(fh):
|
||||
features.extend(list(extract_features(fh, bbh)))
|
||||
|
||||
import pprint
|
||||
|
||||
pprint.pprint(features) # noqa: T203
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
# Copyright 2023 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class GhidraContext:
|
||||
"""
|
||||
State holder for the Ghidra backend to avoid passing state to every function.
|
||||
|
||||
PyGhidra uses a context manager to set up the Ghidra environment (program, transaction, etc.).
|
||||
We store the relevant objects here to allow easy access throughout the extractor
|
||||
without needing to pass them as arguments to every feature extraction method.
|
||||
"""
|
||||
|
||||
def __init__(self, program, flat_api, monitor):
|
||||
self.program = program
|
||||
self.flat_api = flat_api
|
||||
self.monitor = monitor
|
||||
|
||||
|
||||
_context: Optional[GhidraContext] = None
|
||||
|
||||
|
||||
def set_context(program, flat_api, monitor):
|
||||
global _context
|
||||
_context = GhidraContext(program, flat_api, monitor)
|
||||
|
||||
|
||||
def get_context() -> GhidraContext:
|
||||
if _context is None:
|
||||
raise RuntimeError("GhidraContext not initialized")
|
||||
return _context
|
||||
@@ -12,14 +12,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import weakref
|
||||
import contextlib
|
||||
from typing import Iterator
|
||||
|
||||
import capa.features.extractors.ghidra.file
|
||||
import capa.features.extractors.ghidra.insn
|
||||
import capa.features.extractors.ghidra.global_
|
||||
import capa.features.extractors.ghidra.helpers as ghidra_helpers
|
||||
import capa.features.extractors.ghidra.function
|
||||
import capa.features.extractors.ghidra.basicblock
|
||||
from capa.features.common import Feature
|
||||
@@ -34,20 +31,19 @@ from capa.features.extractors.base_extractor import (
|
||||
|
||||
|
||||
class GhidraFeatureExtractor(StaticFeatureExtractor):
|
||||
def __init__(self, ctx_manager=None, tmpdir=None):
|
||||
self.ctx_manager = ctx_manager
|
||||
self.tmpdir = tmpdir
|
||||
def __init__(self):
|
||||
import capa.features.extractors.ghidra.helpers as ghidra_helpers
|
||||
|
||||
super().__init__(
|
||||
SampleHashes(
|
||||
md5=ghidra_helpers.get_current_program().getExecutableMD5(),
|
||||
md5=capa.ghidra.helpers.get_file_md5(),
|
||||
# ghidra doesn't expose this hash.
|
||||
# https://ghidra.re/ghidra_docs/api/ghidra/program/model/listing/Program.html
|
||||
#
|
||||
# the hashes are stored in the database, not computed on the fly,
|
||||
# so it's probably not trivial to add SHA1.
|
||||
sha1="",
|
||||
sha256=ghidra_helpers.get_current_program().getExecutableSHA256(),
|
||||
sha256=capa.ghidra.helpers.get_file_sha256(),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -59,14 +55,8 @@ class GhidraFeatureExtractor(StaticFeatureExtractor):
|
||||
self.externs = ghidra_helpers.get_file_externs()
|
||||
self.fakes = ghidra_helpers.map_fake_import_addrs()
|
||||
|
||||
# Register cleanup to run when the extractor is garbage collected or when the program exits.
|
||||
# We use weakref.finalize instead of __del__ to avoid issues with reference cycles and
|
||||
# to ensure deterministic cleanup on interpreter shutdown.
|
||||
if self.ctx_manager or self.tmpdir:
|
||||
weakref.finalize(self, cleanup, self.ctx_manager, self.tmpdir)
|
||||
|
||||
def get_base_address(self):
|
||||
return AbsoluteVirtualAddress(ghidra_helpers.get_current_program().getImageBase().getOffset())
|
||||
return AbsoluteVirtualAddress(currentProgram().getImageBase().getOffset()) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
def extract_global_features(self):
|
||||
yield from self.global_features
|
||||
@@ -75,6 +65,7 @@ class GhidraFeatureExtractor(StaticFeatureExtractor):
|
||||
yield from capa.features.extractors.ghidra.file.extract_features()
|
||||
|
||||
def get_functions(self) -> Iterator[FunctionHandle]:
|
||||
import capa.features.extractors.ghidra.helpers as ghidra_helpers
|
||||
|
||||
for fhandle in ghidra_helpers.get_function_symbols():
|
||||
fh: FunctionHandle = FunctionHandle(
|
||||
@@ -86,14 +77,14 @@ class GhidraFeatureExtractor(StaticFeatureExtractor):
|
||||
|
||||
@staticmethod
|
||||
def get_function(addr: int) -> FunctionHandle:
|
||||
|
||||
func = ghidra_helpers.get_flat_api().getFunctionContaining(ghidra_helpers.get_flat_api().toAddr(addr))
|
||||
func = getFunctionContaining(toAddr(addr)) # type: ignore [name-defined] # noqa: F821
|
||||
return FunctionHandle(address=AbsoluteVirtualAddress(func.getEntryPoint().getOffset()), inner=func)
|
||||
|
||||
def extract_function_features(self, fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
yield from capa.features.extractors.ghidra.function.extract_features(fh)
|
||||
|
||||
def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]:
|
||||
import capa.features.extractors.ghidra.helpers as ghidra_helpers
|
||||
|
||||
yield from ghidra_helpers.get_function_blocks(fh)
|
||||
|
||||
@@ -101,17 +92,9 @@ class GhidraFeatureExtractor(StaticFeatureExtractor):
|
||||
yield from capa.features.extractors.ghidra.basicblock.extract_features(fh, bbh)
|
||||
|
||||
def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]:
|
||||
import capa.features.extractors.ghidra.helpers as ghidra_helpers
|
||||
|
||||
yield from ghidra_helpers.get_insn_in_range(bbh)
|
||||
|
||||
def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle):
|
||||
yield from capa.features.extractors.ghidra.insn.extract_features(fh, bbh, ih)
|
||||
|
||||
|
||||
def cleanup(ctx_manager, tmpdir):
|
||||
if ctx_manager:
|
||||
with contextlib.suppress(Exception):
|
||||
ctx_manager.__exit__(None, None, None)
|
||||
if tmpdir:
|
||||
with contextlib.suppress(Exception):
|
||||
tmpdir.cleanup()
|
||||
|
||||
@@ -80,54 +80,22 @@ def extract_file_embedded_pe() -> Iterator[tuple[Feature, Address]]:
|
||||
for i in range(256)
|
||||
]
|
||||
|
||||
for block in capa.features.extractors.ghidra.helpers.get_current_program().getMemory().getBlocks():
|
||||
for block in currentProgram().getMemory().getBlocks(): # type: ignore [name-defined] # noqa: F821
|
||||
if not all((block.isLoaded(), block.isInitialized(), "Headers" not in block.getName())):
|
||||
continue
|
||||
|
||||
for off, _ in find_embedded_pe(capa.features.extractors.ghidra.helpers.get_block_bytes(block), mz_xor):
|
||||
# add offset back to block start
|
||||
ea_addr = block.getStart().add(off)
|
||||
ea = ea_addr.getOffset()
|
||||
f_offset = capa.features.extractors.ghidra.helpers.get_file_offset(ea_addr)
|
||||
if f_offset != -1:
|
||||
ea = f_offset
|
||||
ea: int = block.getStart().add(off).getOffset()
|
||||
|
||||
yield Characteristic("embedded pe"), FileOffsetAddress(ea)
|
||||
|
||||
|
||||
def extract_file_export_names() -> Iterator[tuple[Feature, Address]]:
|
||||
"""extract function exports"""
|
||||
program = capa.features.extractors.ghidra.helpers.get_current_program()
|
||||
st = program.getSymbolTable()
|
||||
|
||||
st = currentProgram().getSymbolTable() # type: ignore [name-defined] # noqa: F821
|
||||
for addr in st.getExternalEntryPointIterator():
|
||||
sym = st.getPrimarySymbol(addr)
|
||||
name = sym.getName()
|
||||
|
||||
# Check for forwarded export
|
||||
is_forwarded = False
|
||||
refs = program.getReferenceManager().getReferencesFrom(addr)
|
||||
for ref in refs:
|
||||
if ref.getToAddress().isExternalAddress():
|
||||
ext_sym = st.getPrimarySymbol(ref.getToAddress())
|
||||
if ext_sym:
|
||||
ext_loc = program.getExternalManager().getExternalLocation(ext_sym)
|
||||
if ext_loc:
|
||||
# It is a forwarded export
|
||||
libname = ext_loc.getLibraryName()
|
||||
if libname.lower().endswith(".dll"):
|
||||
libname = libname[:-4]
|
||||
|
||||
forwarded_name = f"{libname}.{ext_loc.getLabel()}"
|
||||
forwarded_name = capa.features.extractors.helpers.reformat_forwarded_export_name(forwarded_name)
|
||||
|
||||
yield Export(forwarded_name), AbsoluteVirtualAddress(addr.getOffset())
|
||||
yield Characteristic("forwarded export"), AbsoluteVirtualAddress(addr.getOffset())
|
||||
is_forwarded = True
|
||||
break
|
||||
|
||||
if not is_forwarded:
|
||||
yield Export(name), AbsoluteVirtualAddress(addr.getOffset())
|
||||
yield Export(st.getPrimarySymbol(addr).getName()), AbsoluteVirtualAddress(addr.getOffset())
|
||||
|
||||
|
||||
def extract_file_import_names() -> Iterator[tuple[Feature, Address]]:
|
||||
@@ -142,7 +110,7 @@ def extract_file_import_names() -> Iterator[tuple[Feature, Address]]:
|
||||
- importname
|
||||
"""
|
||||
|
||||
for f in capa.features.extractors.ghidra.helpers.get_current_program().getFunctionManager().getExternalFunctions():
|
||||
for f in currentProgram().getFunctionManager().getExternalFunctions(): # type: ignore [name-defined] # noqa: F821
|
||||
for r in f.getSymbol().getReferences():
|
||||
if r.getReferenceType().isData():
|
||||
addr = r.getFromAddress().getOffset() # gets pointer to fake external addr
|
||||
@@ -158,14 +126,14 @@ def extract_file_import_names() -> Iterator[tuple[Feature, Address]]:
|
||||
def extract_file_section_names() -> Iterator[tuple[Feature, Address]]:
|
||||
"""extract section names"""
|
||||
|
||||
for block in capa.features.extractors.ghidra.helpers.get_current_program().getMemory().getBlocks():
|
||||
for block in currentProgram().getMemory().getBlocks(): # type: ignore [name-defined] # noqa: F821
|
||||
yield Section(block.getName()), AbsoluteVirtualAddress(block.getStart().getOffset())
|
||||
|
||||
|
||||
def extract_file_strings() -> Iterator[tuple[Feature, Address]]:
|
||||
"""extract ASCII and UTF-16 LE strings"""
|
||||
|
||||
for block in capa.features.extractors.ghidra.helpers.get_current_program().getMemory().getBlocks():
|
||||
for block in currentProgram().getMemory().getBlocks(): # type: ignore [name-defined] # noqa: F821
|
||||
if not block.isInitialized():
|
||||
continue
|
||||
|
||||
@@ -185,8 +153,7 @@ def extract_file_function_names() -> Iterator[tuple[Feature, Address]]:
|
||||
extract the names of statically-linked library functions.
|
||||
"""
|
||||
|
||||
for sym in capa.features.extractors.ghidra.helpers.get_current_program().getSymbolTable().getAllSymbols(True):
|
||||
|
||||
for sym in currentProgram().getSymbolTable().getAllSymbols(True): # type: ignore [name-defined] # noqa: F821
|
||||
# .isExternal() misses more than this config for the function symbols
|
||||
if sym.getSymbolType() == SymbolType.FUNCTION and sym.getSource() == SourceType.ANALYSIS and sym.isGlobal():
|
||||
name = sym.getName() # starts to resolve names based on Ghidra's FidDB
|
||||
@@ -203,7 +170,7 @@ def extract_file_function_names() -> Iterator[tuple[Feature, Address]]:
|
||||
|
||||
|
||||
def extract_file_format() -> Iterator[tuple[Feature, Address]]:
|
||||
ef = capa.features.extractors.ghidra.helpers.get_current_program().getExecutableFormat()
|
||||
ef = currentProgram().getExecutableFormat() # type: ignore [name-defined] # noqa: F821
|
||||
if "PE" in ef:
|
||||
yield Format(FORMAT_PE), NO_ADDRESS
|
||||
elif "ELF" in ef:
|
||||
@@ -231,3 +198,14 @@ FILE_HANDLERS = (
|
||||
extract_file_function_names,
|
||||
extract_file_format,
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
""" """
|
||||
import pprint
|
||||
|
||||
pprint.pprint(list(extract_features())) # noqa: T203
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -26,25 +26,21 @@ from capa.features.extractors.base_extractor import FunctionHandle
|
||||
|
||||
def extract_function_calls_to(fh: FunctionHandle):
|
||||
"""extract callers to a function"""
|
||||
f: "ghidra.program.database.function.FunctionDB" = fh.inner
|
||||
f: ghidra.program.database.function.FunctionDB = fh.inner
|
||||
for ref in f.getSymbol().getReferences():
|
||||
if ref.getReferenceType().isCall():
|
||||
yield Characteristic("calls to"), AbsoluteVirtualAddress(ref.getFromAddress().getOffset())
|
||||
|
||||
|
||||
def extract_function_loop(fh: FunctionHandle):
|
||||
f: "ghidra.program.database.function.FunctionDB" = fh.inner
|
||||
f: ghidra.program.database.function.FunctionDB = fh.inner
|
||||
|
||||
edges = []
|
||||
for block in SimpleBlockIterator(
|
||||
BasicBlockModel(capa.features.extractors.ghidra.helpers.get_current_program()),
|
||||
f.getBody(),
|
||||
capa.features.extractors.ghidra.helpers.get_monitor(),
|
||||
):
|
||||
dests = block.getDestinations(capa.features.extractors.ghidra.helpers.get_monitor())
|
||||
for block in SimpleBlockIterator(BasicBlockModel(currentProgram()), f.getBody(), monitor()): # type: ignore [name-defined] # noqa: F821
|
||||
dests = block.getDestinations(monitor()) # type: ignore [name-defined] # noqa: F821
|
||||
s_addrs = block.getStartAddresses()
|
||||
|
||||
while dests.hasNext():
|
||||
while dests.hasNext(): # For loop throws Python TypeError
|
||||
for addr in s_addrs:
|
||||
edges.append((addr.getOffset(), dests.next().getDestinationAddress().getOffset()))
|
||||
|
||||
@@ -53,17 +49,32 @@ def extract_function_loop(fh: FunctionHandle):
|
||||
|
||||
|
||||
def extract_recursive_call(fh: FunctionHandle):
|
||||
f: "ghidra.program.database.function.FunctionDB" = fh.inner
|
||||
f: ghidra.program.database.function.FunctionDB = fh.inner
|
||||
|
||||
for func in f.getCalledFunctions(capa.features.extractors.ghidra.helpers.get_monitor()):
|
||||
for func in f.getCalledFunctions(monitor()): # type: ignore [name-defined] # noqa: F821
|
||||
if func.getEntryPoint().getOffset() == f.getEntryPoint().getOffset():
|
||||
yield Characteristic("recursive call"), AbsoluteVirtualAddress(f.getEntryPoint().getOffset())
|
||||
|
||||
|
||||
def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
for function_handler in FUNCTION_HANDLERS:
|
||||
for feature, addr in function_handler(fh):
|
||||
for func_handler in FUNCTION_HANDLERS:
|
||||
for feature, addr in func_handler(fh):
|
||||
yield feature, addr
|
||||
|
||||
|
||||
FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call)
|
||||
|
||||
|
||||
def main():
|
||||
""" """
|
||||
features = []
|
||||
for fhandle in capa.features.extractors.ghidra.helpers.get_function_symbols():
|
||||
features.extend(list(extract_features(fhandle)))
|
||||
|
||||
import pprint
|
||||
|
||||
pprint.pprint(features) # noqa: T203
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_os() -> Iterator[tuple[Feature, Address]]:
|
||||
format_name: str = capa.features.extractors.ghidra.helpers.get_current_program().getExecutableFormat()
|
||||
format_name: str = currentProgram().getExecutableFormat() # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if "PE" in format_name:
|
||||
yield OS(OS_WINDOWS), NO_ADDRESS
|
||||
@@ -53,7 +53,7 @@ def extract_os() -> Iterator[tuple[Feature, Address]]:
|
||||
|
||||
|
||||
def extract_arch() -> Iterator[tuple[Feature, Address]]:
|
||||
lang_id = capa.features.extractors.ghidra.helpers.get_current_program().getMetadata().get("Language ID")
|
||||
lang_id = currentProgram().getMetadata().get("Language ID") # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if "x86" in lang_id and "64" in lang_id:
|
||||
yield Arch(ARCH_AMD64), NO_ADDRESS
|
||||
|
||||
@@ -22,22 +22,9 @@ from ghidra.program.model.symbol import SourceType, SymbolType
|
||||
from ghidra.program.model.address import AddressSpace
|
||||
|
||||
import capa.features.extractors.helpers
|
||||
import capa.features.extractors.ghidra.context as ghidra_context
|
||||
from capa.features.common import THUNK_CHAIN_DEPTH_DELTA
|
||||
from capa.features.address import AbsoluteVirtualAddress
|
||||
from capa.features.extractors.base_extractor import BBHandle, InsnHandle
|
||||
|
||||
|
||||
def get_current_program():
|
||||
return ghidra_context.get_context().program
|
||||
|
||||
|
||||
def get_monitor():
|
||||
return ghidra_context.get_context().monitor
|
||||
|
||||
|
||||
def get_flat_api():
|
||||
return ghidra_context.get_context().flat_api
|
||||
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
|
||||
|
||||
|
||||
def ints_to_bytes(bytez: list[int]) -> bytes:
|
||||
@@ -49,7 +36,7 @@ def ints_to_bytes(bytez: list[int]) -> bytes:
|
||||
return bytes([b & 0xFF for b in bytez])
|
||||
|
||||
|
||||
def find_byte_sequence(addr: "ghidra.program.model.address.Address", seq: bytes) -> Iterator[int]:
|
||||
def find_byte_sequence(addr: ghidra.program.model.address.Address, seq: bytes) -> Iterator[int]:
|
||||
"""yield all ea of a given byte sequence
|
||||
|
||||
args:
|
||||
@@ -57,25 +44,12 @@ def find_byte_sequence(addr: "ghidra.program.model.address.Address", seq: bytes)
|
||||
seq: bytes to search e.g. b"\x01\x03"
|
||||
"""
|
||||
seqstr = "".join([f"\\x{b:02x}" for b in seq])
|
||||
eas = get_flat_api().findBytes(addr, seqstr, java.lang.Integer.MAX_VALUE, 1)
|
||||
eas = findBytes(addr, seqstr, java.lang.Integer.MAX_VALUE, 1) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
yield from eas
|
||||
|
||||
|
||||
def get_file_offset(addr: "ghidra.program.model.address.Address") -> int:
|
||||
"""get file offset for an address"""
|
||||
block = get_current_program().getMemory().getBlock(addr)
|
||||
if not block:
|
||||
return -1
|
||||
|
||||
for info in block.getSourceInfos():
|
||||
if info.contains(addr):
|
||||
return info.getFileBytesOffset(addr)
|
||||
|
||||
return -1
|
||||
|
||||
|
||||
def get_bytes(addr: "ghidra.program.model.address.Address", length: int) -> bytes:
|
||||
def get_bytes(addr: ghidra.program.model.address.Address, length: int) -> bytes:
|
||||
"""yield length bytes at addr
|
||||
|
||||
args:
|
||||
@@ -83,12 +57,12 @@ def get_bytes(addr: "ghidra.program.model.address.Address", length: int) -> byte
|
||||
length: length of bytes to pull
|
||||
"""
|
||||
try:
|
||||
return ints_to_bytes(get_flat_api().getBytes(addr, int(length)))
|
||||
except Exception:
|
||||
return ints_to_bytes(getBytes(addr, length)) # type: ignore [name-defined] # noqa: F821
|
||||
except RuntimeError:
|
||||
return b""
|
||||
|
||||
|
||||
def get_block_bytes(block: "ghidra.program.model.mem.MemoryBlock") -> bytes:
|
||||
def get_block_bytes(block: ghidra.program.model.mem.MemoryBlock) -> bytes:
|
||||
"""yield all bytes in a given block
|
||||
|
||||
args:
|
||||
@@ -99,21 +73,20 @@ def get_block_bytes(block: "ghidra.program.model.mem.MemoryBlock") -> bytes:
|
||||
|
||||
def get_function_symbols():
|
||||
"""yield all non-external function symbols"""
|
||||
yield from get_current_program().getFunctionManager().getFunctionsNoStubs(True)
|
||||
yield from currentProgram().getFunctionManager().getFunctionsNoStubs(True) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
|
||||
def get_function_blocks(fh: "capa.features.extractors.base_extractor.FunctionHandle") -> Iterator[BBHandle]:
|
||||
"""
|
||||
yield the basic blocks of the function
|
||||
"""
|
||||
def get_function_blocks(fh: FunctionHandle) -> Iterator[BBHandle]:
|
||||
"""yield BBHandle for each bb in a given function"""
|
||||
|
||||
for block in SimpleBlockIterator(BasicBlockModel(get_current_program()), fh.inner.getBody(), get_monitor()):
|
||||
yield BBHandle(address=AbsoluteVirtualAddress(block.getMinAddress().getOffset()), inner=block)
|
||||
func: ghidra.program.database.function.FunctionDB = fh.inner
|
||||
for bb in SimpleBlockIterator(BasicBlockModel(currentProgram()), func.getBody(), monitor()): # type: ignore [name-defined] # noqa: F821
|
||||
yield BBHandle(address=AbsoluteVirtualAddress(bb.getMinAddress().getOffset()), inner=bb)
|
||||
|
||||
|
||||
def get_insn_in_range(bbh: BBHandle) -> Iterator[InsnHandle]:
|
||||
"""yield InshHandle for each insn in a given basicblock"""
|
||||
for insn in get_current_program().getListing().getInstructions(bbh.inner, True):
|
||||
for insn in currentProgram().getListing().getInstructions(bbh.inner, True): # type: ignore [name-defined] # noqa: F821
|
||||
yield InsnHandle(address=AbsoluteVirtualAddress(insn.getAddress().getOffset()), inner=insn)
|
||||
|
||||
|
||||
@@ -122,7 +95,7 @@ def get_file_imports() -> dict[int, list[str]]:
|
||||
|
||||
import_dict: dict[int, list[str]] = {}
|
||||
|
||||
for f in get_current_program().getFunctionManager().getExternalFunctions():
|
||||
for f in currentProgram().getFunctionManager().getExternalFunctions(): # type: ignore [name-defined] # noqa: F821
|
||||
for r in f.getSymbol().getReferences():
|
||||
if r.getReferenceType().isData():
|
||||
addr = r.getFromAddress().getOffset() # gets pointer to fake external addr
|
||||
@@ -160,7 +133,7 @@ def get_file_externs() -> dict[int, list[str]]:
|
||||
|
||||
extern_dict: dict[int, list[str]] = {}
|
||||
|
||||
for sym in get_current_program().getSymbolTable().getAllSymbols(True):
|
||||
for sym in currentProgram().getSymbolTable().getAllSymbols(True): # type: ignore [name-defined] # noqa: F821
|
||||
# .isExternal() misses more than this config for the function symbols
|
||||
if sym.getSymbolType() == SymbolType.FUNCTION and sym.getSource() == SourceType.ANALYSIS and sym.isGlobal():
|
||||
name = sym.getName() # starts to resolve names based on Ghidra's FidDB
|
||||
@@ -198,7 +171,7 @@ def map_fake_import_addrs() -> dict[int, list[int]]:
|
||||
"""
|
||||
fake_dict: dict[int, list[int]] = {}
|
||||
|
||||
for f in get_current_program().getFunctionManager().getExternalFunctions():
|
||||
for f in currentProgram().getFunctionManager().getExternalFunctions(): # type: ignore [name-defined] # noqa: F821
|
||||
for r in f.getSymbol().getReferences():
|
||||
if r.getReferenceType().isData():
|
||||
fake_dict.setdefault(f.getEntryPoint().getOffset(), []).append(r.getFromAddress().getOffset())
|
||||
@@ -207,7 +180,7 @@ def map_fake_import_addrs() -> dict[int, list[int]]:
|
||||
|
||||
|
||||
def check_addr_for_api(
|
||||
addr: "ghidra.program.model.address.Address",
|
||||
addr: ghidra.program.model.address.Address,
|
||||
fakes: dict[int, list[int]],
|
||||
imports: dict[int, list[str]],
|
||||
externs: dict[int, list[str]],
|
||||
@@ -229,18 +202,18 @@ def check_addr_for_api(
|
||||
return False
|
||||
|
||||
|
||||
def is_call_or_jmp(insn: "ghidra.program.database.code.InstructionDB") -> bool:
|
||||
def is_call_or_jmp(insn: ghidra.program.database.code.InstructionDB) -> bool:
|
||||
return any(mnem in insn.getMnemonicString() for mnem in ["CALL", "J"]) # JMP, JNE, JNZ, etc
|
||||
|
||||
|
||||
def is_sp_modified(insn: "ghidra.program.database.code.InstructionDB") -> bool:
|
||||
def is_sp_modified(insn: ghidra.program.database.code.InstructionDB) -> bool:
|
||||
for i in range(insn.getNumOperands()):
|
||||
if insn.getOperandType(i) == OperandType.REGISTER:
|
||||
return "SP" in insn.getRegister(i).getName() and insn.getOperandRefType(i).isWrite()
|
||||
return False
|
||||
|
||||
|
||||
def is_stack_referenced(insn: "ghidra.program.database.code.InstructionDB") -> bool:
|
||||
def is_stack_referenced(insn: ghidra.program.database.code.InstructionDB) -> bool:
|
||||
"""generic catch-all for stack references"""
|
||||
for i in range(insn.getNumOperands()):
|
||||
if insn.getOperandType(i) == OperandType.REGISTER:
|
||||
@@ -252,7 +225,7 @@ def is_stack_referenced(insn: "ghidra.program.database.code.InstructionDB") -> b
|
||||
return any(ref.isStackReference() for ref in insn.getReferencesFrom())
|
||||
|
||||
|
||||
def is_zxor(insn: "ghidra.program.database.code.InstructionDB") -> bool:
|
||||
def is_zxor(insn: ghidra.program.database.code.InstructionDB) -> bool:
|
||||
# assume XOR insn
|
||||
# XOR's against the same operand zero out
|
||||
ops = []
|
||||
@@ -268,29 +241,29 @@ def is_zxor(insn: "ghidra.program.database.code.InstructionDB") -> bool:
|
||||
return all(n == operands[0] for n in operands)
|
||||
|
||||
|
||||
def handle_thunk(addr: "ghidra.program.model.address.Address"):
|
||||
def handle_thunk(addr: ghidra.program.model.address.Address):
|
||||
"""Follow thunk chains down to a reasonable depth"""
|
||||
ref = addr
|
||||
for _ in range(THUNK_CHAIN_DEPTH_DELTA):
|
||||
thunk_jmp = get_flat_api().getInstructionAt(ref)
|
||||
thunk_jmp = getInstructionAt(ref) # type: ignore [name-defined] # noqa: F821
|
||||
if thunk_jmp and is_call_or_jmp(thunk_jmp):
|
||||
if OperandType.isAddress(thunk_jmp.getOperandType(0)):
|
||||
ref = thunk_jmp.getAddress(0)
|
||||
else:
|
||||
thunk_dat = get_flat_api().getDataContaining(ref)
|
||||
thunk_dat = getDataContaining(ref) # type: ignore [name-defined] # noqa: F821
|
||||
if thunk_dat and thunk_dat.isDefined() and thunk_dat.isPointer():
|
||||
ref = thunk_dat.getValue()
|
||||
break # end of thunk chain reached
|
||||
return ref
|
||||
|
||||
|
||||
def dereference_ptr(insn: "ghidra.program.database.code.InstructionDB"):
|
||||
def dereference_ptr(insn: ghidra.program.database.code.InstructionDB):
|
||||
addr_code = OperandType.ADDRESS | OperandType.CODE
|
||||
to_deref = insn.getAddress(0)
|
||||
dat = get_flat_api().getDataContaining(to_deref)
|
||||
dat = getDataContaining(to_deref) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if insn.getOperandType(0) == addr_code:
|
||||
thfunc = get_flat_api().getFunctionContaining(to_deref)
|
||||
thfunc = getFunctionContaining(to_deref) # type: ignore [name-defined] # noqa: F821
|
||||
if thfunc and thfunc.isThunk():
|
||||
return handle_thunk(to_deref)
|
||||
else:
|
||||
@@ -321,7 +294,7 @@ def find_data_references_from_insn(insn, max_depth: int = 10):
|
||||
to_addr = reference.getToAddress()
|
||||
|
||||
for _ in range(max_depth - 1):
|
||||
data = get_flat_api().getDataAt(to_addr)
|
||||
data = getDataAt(to_addr) # type: ignore [name-defined] # noqa: F821
|
||||
if data and data.isPointer():
|
||||
ptr_value = data.getValue()
|
||||
|
||||
|
||||
@@ -234,7 +234,7 @@ def extract_insn_bytes_features(fh: FunctionHandle, bb: BBHandle, ih: InsnHandle
|
||||
push offset iid_004118d4_IShellLinkA ; riid
|
||||
"""
|
||||
for addr in capa.features.extractors.ghidra.helpers.find_data_references_from_insn(ih.inner):
|
||||
data = capa.features.extractors.ghidra.helpers.get_flat_api().getDataAt(addr)
|
||||
data = getDataAt(addr) # type: ignore [name-defined] # noqa: F821
|
||||
if data and not data.hasStringValue():
|
||||
extracted_bytes = capa.features.extractors.ghidra.helpers.get_bytes(addr, MAX_BYTES_FEATURE_SIZE)
|
||||
if extracted_bytes and not capa.features.extractors.helpers.all_zeros(extracted_bytes):
|
||||
@@ -249,9 +249,9 @@ def extract_insn_string_features(fh: FunctionHandle, bb: BBHandle, ih: InsnHandl
|
||||
push offset aAcr ; "ACR > "
|
||||
"""
|
||||
for addr in capa.features.extractors.ghidra.helpers.find_data_references_from_insn(ih.inner):
|
||||
data = capa.features.extractors.ghidra.helpers.get_flat_api().getDataAt(addr)
|
||||
data = getDataAt(addr) # type: ignore [name-defined] # noqa: F821
|
||||
if data and data.hasStringValue():
|
||||
yield String(str(data.getValue())), ih.address
|
||||
yield String(data.getValue()), ih.address
|
||||
|
||||
|
||||
def extract_insn_mnemonic_features(
|
||||
@@ -361,8 +361,8 @@ def extract_insn_cross_section_cflow(
|
||||
if capa.features.extractors.ghidra.helpers.check_addr_for_api(ref, fakes, imports, externs):
|
||||
return
|
||||
|
||||
this_mem_block = capa.features.extractors.ghidra.helpers.get_flat_api().getMemoryBlock(insn.getAddress())
|
||||
ref_block = capa.features.extractors.ghidra.helpers.get_flat_api().getMemoryBlock(ref)
|
||||
this_mem_block = getMemoryBlock(insn.getAddress()) # type: ignore [name-defined] # noqa: F821
|
||||
ref_block = getMemoryBlock(ref) # type: ignore [name-defined] # noqa: F821
|
||||
if ref_block != this_mem_block:
|
||||
yield Characteristic("cross section flow"), ih.address
|
||||
|
||||
@@ -425,19 +425,19 @@ def check_nzxor_security_cookie_delta(
|
||||
Check if insn within last addr of last bb - delta
|
||||
"""
|
||||
|
||||
model = SimpleBlockModel(capa.features.extractors.ghidra.helpers.get_current_program())
|
||||
model = SimpleBlockModel(currentProgram()) # type: ignore [name-defined] # noqa: F821
|
||||
insn_addr = insn.getAddress()
|
||||
func_asv = fh.getBody()
|
||||
|
||||
first_addr = func_asv.getMinAddress()
|
||||
if insn_addr < first_addr.add(SECURITY_COOKIE_BYTES_DELTA):
|
||||
first_bb = model.getFirstCodeBlockContaining(first_addr, capa.features.extractors.ghidra.helpers.get_monitor())
|
||||
first_bb = model.getFirstCodeBlockContaining(first_addr, monitor()) # type: ignore [name-defined] # noqa: F821
|
||||
if first_bb.contains(insn_addr):
|
||||
return True
|
||||
|
||||
last_addr = func_asv.getMaxAddress()
|
||||
if insn_addr > last_addr.add(SECURITY_COOKIE_BYTES_DELTA * -1):
|
||||
last_bb = model.getFirstCodeBlockContaining(last_addr, capa.features.extractors.ghidra.helpers.get_monitor())
|
||||
last_bb = model.getFirstCodeBlockContaining(last_addr, monitor()) # type: ignore [name-defined] # noqa: F821
|
||||
if last_bb.contains(insn_addr):
|
||||
return True
|
||||
|
||||
@@ -488,3 +488,22 @@ INSTRUCTION_HANDLERS = (
|
||||
extract_function_calls_from,
|
||||
extract_function_indirect_call_characteristic_features,
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
""" """
|
||||
features = []
|
||||
from capa.features.extractors.ghidra.extractor import GhidraFeatureExtractor
|
||||
|
||||
for fh in GhidraFeatureExtractor().get_functions():
|
||||
for bb in capa.features.extractors.ghidra.helpers.get_function_blocks(fh):
|
||||
for insn in capa.features.extractors.ghidra.helpers.get_insn_in_range(bb):
|
||||
features.extend(list(extract_features(fh, bb, insn)))
|
||||
|
||||
import pprint
|
||||
|
||||
pprint.pprint(features) # noqa: T203
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -18,7 +18,6 @@ import idaapi
|
||||
import idautils
|
||||
|
||||
import capa.features.extractors.ida.helpers
|
||||
from capa.features.file import FunctionName
|
||||
from capa.features.common import Feature, Characteristic
|
||||
from capa.features.address import Address, AbsoluteVirtualAddress
|
||||
from capa.features.extractors import loops
|
||||
@@ -51,39 +50,10 @@ def extract_recursive_call(fh: FunctionHandle):
|
||||
yield Characteristic("recursive call"), fh.address
|
||||
|
||||
|
||||
def extract_function_name(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
ea = fh.inner.start_ea
|
||||
name = idaapi.get_name(ea)
|
||||
if name.startswith("sub_"):
|
||||
# skip default names, like "sub_401000"
|
||||
return
|
||||
|
||||
yield FunctionName(name), fh.address
|
||||
if name.startswith("_"):
|
||||
# some linkers may prefix linked routines with a `_` to avoid name collisions.
|
||||
# extract features for both the mangled and un-mangled representations.
|
||||
# e.g. `_fwrite` -> `fwrite`
|
||||
# see: https://stackoverflow.com/a/2628384/87207
|
||||
yield FunctionName(name[1:]), fh.address
|
||||
|
||||
|
||||
def extract_function_alternative_names(fh: FunctionHandle):
|
||||
"""Get all alternative names for an address."""
|
||||
|
||||
for aname in capa.features.extractors.ida.helpers.get_function_alternative_names(fh.inner.start_ea):
|
||||
yield FunctionName(aname), fh.address
|
||||
|
||||
|
||||
def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
for func_handler in FUNCTION_HANDLERS:
|
||||
for feature, addr in func_handler(fh):
|
||||
yield feature, addr
|
||||
|
||||
|
||||
FUNCTION_HANDLERS = (
|
||||
extract_function_calls_to,
|
||||
extract_function_loop,
|
||||
extract_recursive_call,
|
||||
extract_function_name,
|
||||
extract_function_alternative_names,
|
||||
)
|
||||
FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call)
|
||||
|
||||
@@ -20,7 +20,6 @@ import idaapi
|
||||
import ida_nalt
|
||||
import idautils
|
||||
import ida_bytes
|
||||
import ida_funcs
|
||||
import ida_segment
|
||||
|
||||
from capa.features.address import AbsoluteVirtualAddress
|
||||
@@ -437,16 +436,3 @@ def is_basic_block_return(bb: idaapi.BasicBlock) -> bool:
|
||||
def has_sib(oper: idaapi.op_t) -> bool:
|
||||
# via: https://reverseengineering.stackexchange.com/a/14300
|
||||
return oper.specflag1 == 1
|
||||
|
||||
|
||||
def find_alternative_names(cmt: str):
|
||||
for line in cmt.split("\n"):
|
||||
if line.startswith("Alternative name is '") and line.endswith("'"):
|
||||
name = line[len("Alternative name is '") : -1] # Extract name between quotes
|
||||
yield name
|
||||
|
||||
|
||||
def get_function_alternative_names(fva: int):
|
||||
"""Get all alternative names for an address."""
|
||||
yield from find_alternative_names(ida_bytes.get_cmt(fva, False) or "")
|
||||
yield from find_alternative_names(ida_funcs.get_func_cmt(idaapi.get_func(fva), False) or "")
|
||||
|
||||
@@ -22,7 +22,6 @@ import idautils
|
||||
|
||||
import capa.features.extractors.helpers
|
||||
import capa.features.extractors.ida.helpers
|
||||
from capa.features.file import FunctionName
|
||||
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
|
||||
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic
|
||||
from capa.features.address import Address, AbsoluteVirtualAddress
|
||||
@@ -130,8 +129,8 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle)
|
||||
# not a function (start)
|
||||
return
|
||||
|
||||
name = idaapi.get_name(target_func.start_ea)
|
||||
if target_func.flags & idaapi.FUNC_LIB or not name.startswith("sub_"):
|
||||
if target_func.flags & idaapi.FUNC_LIB:
|
||||
name = idaapi.get_name(target_func.start_ea)
|
||||
yield API(name), ih.address
|
||||
if name.startswith("_"):
|
||||
# some linkers may prefix linked routines with a `_` to avoid name collisions.
|
||||
@@ -140,10 +139,6 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle)
|
||||
# see: https://stackoverflow.com/a/2628384/87207
|
||||
yield API(name[1:]), ih.address
|
||||
|
||||
for altname in capa.features.extractors.ida.helpers.get_function_alternative_names(target_func.start_ea):
|
||||
yield FunctionName(altname), ih.address
|
||||
yield API(altname), ih.address
|
||||
|
||||
|
||||
def extract_insn_number_features(
|
||||
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
||||
|
||||
@@ -56,7 +56,7 @@ def get_previous_instructions(vw: VivWorkspace, va: int) -> list[int]:
|
||||
if ploc is not None:
|
||||
# from vivisect.const:
|
||||
# location: (L_VA, L_SIZE, L_LTYPE, L_TINFO)
|
||||
pva, _, ptype, pinfo = ploc
|
||||
(pva, _, ptype, pinfo) = ploc
|
||||
|
||||
if ptype == LOC_OP and not (pinfo & IF_NOFALL):
|
||||
ret.append(pva)
|
||||
|
||||
@@ -176,7 +176,7 @@ def extract_insn_api_features(fh: FunctionHandle, bb, ih: InsnHandle) -> Iterato
|
||||
|
||||
elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper):
|
||||
try:
|
||||
_, target = resolve_indirect_call(f.vw, insn.va, insn=insn)
|
||||
(_, target) = resolve_indirect_call(f.vw, insn.va, insn=insn)
|
||||
except NotFoundError:
|
||||
# not able to resolve the indirect call, sorry
|
||||
return
|
||||
|
||||
@@ -96,7 +96,14 @@ class VMRayAnalysis:
|
||||
% (self.submission_name, self.submission_type)
|
||||
)
|
||||
|
||||
if self.submission_static is None:
|
||||
if self.submission_static is not None:
|
||||
if self.submission_static.pe is None and self.submission_static.elf is None:
|
||||
# we only support static analysis for PE and ELF files for now
|
||||
raise UnsupportedFormatError(
|
||||
"archive does not contain a supported file format (submission_name: %s, submission_type: %s)"
|
||||
% (self.submission_name, self.submission_type)
|
||||
)
|
||||
else:
|
||||
# VMRay may not record static analysis for certain file types, e.g. MSI, but we'd still like to match dynamic
|
||||
# execution so we continue without and accept that the results may be incomplete
|
||||
logger.warning(
|
||||
|
||||
@@ -26,16 +26,6 @@ from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, Pr
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
VOID_PTR_NUMBER_PARAMS = frozenset(
|
||||
{
|
||||
"hKey",
|
||||
"hKeyRoot",
|
||||
"hkResult",
|
||||
"samDesired",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
if param.deref is not None:
|
||||
# pointer types contain a special "deref" member that stores the deref'd value
|
||||
@@ -49,31 +39,10 @@ def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[tuple[Feat
|
||||
# parsing the data up to here results in double-escaped backslashes, remove those here
|
||||
yield String(param.deref.value.replace("\\\\", "\\")), ch.address
|
||||
else:
|
||||
if param.name in VOID_PTR_NUMBER_PARAMS:
|
||||
try:
|
||||
yield Number(hexint(param.deref.value)), ch.address
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.debug(
|
||||
"failed to parse whitelisted void_ptr param %s value %s: %s",
|
||||
param.name,
|
||||
param.deref.value,
|
||||
e,
|
||||
)
|
||||
else:
|
||||
logger.debug("skipping deref param type %s", param.deref.type_)
|
||||
logger.debug("skipping deref param type %s", param.deref.type_)
|
||||
elif param.value is not None:
|
||||
if param.type_ in PARAM_TYPE_INT:
|
||||
yield Number(hexint(param.value)), ch.address
|
||||
elif param.type_ == "void_ptr" and param.name in VOID_PTR_NUMBER_PARAMS:
|
||||
try:
|
||||
yield Number(hexint(param.value)), ch.address
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.debug(
|
||||
"failed to parse whitelisted void_ptr param %s value %s: %s",
|
||||
param.name,
|
||||
param.value,
|
||||
e,
|
||||
)
|
||||
|
||||
|
||||
def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
|
||||
from typing import Iterator
|
||||
from pathlib import Path
|
||||
|
||||
@@ -39,8 +39,6 @@ from capa.features.extractors.base_extractor import (
|
||||
DynamicFeatureExtractor,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_formatted_params(params: ParamList) -> list[str]:
|
||||
params_list: list[str] = []
|
||||
@@ -89,16 +87,6 @@ class VMRayExtractor(DynamicFeatureExtractor):
|
||||
|
||||
def get_processes(self) -> Iterator[ProcessHandle]:
|
||||
for monitor_process in self.analysis.monitor_processes.values():
|
||||
# skip invalid/incomplete monitor process entries, see #2807
|
||||
if monitor_process.pid == 0 or not monitor_process.filename:
|
||||
logger.debug(
|
||||
"skipping incomplete process entry: pid=%d, filename=%s, monitor_id=%d",
|
||||
monitor_process.pid,
|
||||
monitor_process.filename,
|
||||
monitor_process.monitor_id,
|
||||
)
|
||||
continue
|
||||
|
||||
address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
|
||||
yield ProcessHandle(address, inner=monitor_process)
|
||||
|
||||
|
||||
@@ -1,75 +1,107 @@
|
||||
# capa analysis using Ghidra
|
||||
<div align="center">
|
||||
<img src="../../doc/img/ghidra_backend_logo.png" width=240 height=125>
|
||||
</div>
|
||||
|
||||
capa supports using Ghidra (via [PyGhidra](https://github.com/NationalSecurityAgency/ghidra/tree/master/Ghidra/Features/PyGhidra)) as a feature extraction backend. This enables you to run capa against binaries using Ghidra's analysis engine.
|
||||
# capa + Ghidra
|
||||
|
||||
[capa](https://github.com/mandiant/capa) is the FLARE team’s open-source tool that detects capabilities in executable files. [Ghidra](https://github.com/NationalSecurityAgency/ghidra) is an open-source software reverse engineering framework created and maintained by the National Security Agency Research Directorate. capa + Ghidra brings capa’s detection capabilities directly to Ghidra’s user interface helping speed up your reverse engineering tasks by identifying what parts of a program suggest interesting behavior, such as setting a registry value. You can execute the included Python 3 scripts [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) or [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) to run capa’s analysis and view the results in Ghidra. You may be asking yourself, “Python 3 scripts in Ghidra?”. You read that correctly. This integration is written entirely in Python 3 and relies on [Ghidrathon]( https://github.com/mandiant/ghidrathon), an open source Ghidra extension that adds Python 3 scripting to Ghidra.
|
||||
|
||||
Check out our capa + Ghidra blog posts:
|
||||
* [Riding Dragons: capa Harnesses Ghidra](https://www.mandiant.com/resources/blog/capa-harnesses-ghidra)
|
||||
|
||||
## UI Integration
|
||||
[capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) renders capa results in Ghidra's UI to help you quickly navigate them. This includes adding matched functions to Ghidra’s Symbol Tree and Bookmarks windows and adding comments to functions that indicate matched capabilities and features. You can execute this script using Ghidra’s Script Manager window.
|
||||
|
||||
### Symbol Tree Window
|
||||
Matched functions are added to Ghidra's Symbol Tree window under a custom namespace that maps to the capabilities' [capa namespace](https://github.com/mandiant/capa-rules/blob/master/doc/format.md#rule-namespace).
|
||||
<div align="center">
|
||||
<img src="https://github.com/mandiant/capa/assets/66766340/eeae33f4-99d4-42dc-a5e8-4c1b8c661492" width=300>
|
||||
</div>
|
||||
|
||||
### Comments
|
||||
|
||||
Comments are added at the beginning of matched functions indicating matched capabilities and inline comments are added to functions indicating matched features. You can view these comments in Ghidra’s Disassembly Listing and Decompile windows.
|
||||
<div align="center">
|
||||
<img src="https://github.com/mandiant/capa/assets/66766340/bb2b4170-7fd4-45fc-8c7b-ff8f2e2f101b" width=1000>
|
||||
</div>
|
||||
|
||||
### Bookmarks
|
||||
|
||||
Bookmarks are added to functions that matched a capability that is mapped to a MITRE ATT&CK and/or Malware Behavior Catalog (MBC) technique. You can view these bookmarks in Ghidra's Bookmarks window.
|
||||
<div align="center">
|
||||
<img src="https://github.com/mandiant/capa/assets/66766340/7f9a66a9-7be7-4223-91c6-4b8fc4651336" width=825>
|
||||
</div>
|
||||
|
||||
## Text-based Integration
|
||||
|
||||
[capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) outputs text-based capa results that mirror the output of capa’s standalone tool. You can execute this script using Ghidra’s Script Manager and view its output in Ghidra’s Console window.
|
||||
|
||||
<div align="center">
|
||||
<img src="../../doc/img/ghidra_script_mngr_output.png" width=700>
|
||||
</div>
|
||||
|
||||
You can also execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra's Headless Analyzer to view its output in a terminal window.
|
||||
|
||||
<div align="center">
|
||||
<img src="../../doc/img/ghidra_headless_analyzer.png">
|
||||
</div>
|
||||
|
||||
# Getting Started
|
||||
|
||||
## Requirements
|
||||
|
||||
| Tool | Version | Source |
|
||||
|------------|---------|--------|
|
||||
| capa | `>= 7.0.0` | https://github.com/mandiant/capa/releases |
|
||||
| Ghidrathon | `>= 3.0.0` | https://github.com/mandiant/Ghidrathon/releases |
|
||||
| Ghidra | `>= 10.3.2` | https://github.com/NationalSecurityAgency/ghidra/releases |
|
||||
| Python | `>= 3.10.0` | https://www.python.org/downloads |
|
||||
|
||||
## Installation
|
||||
|
||||
**Note**: capa + Ghidra relies on [Ghidrathon]( https://github.com/mandiant/ghidrathon) to execute Python 3 code in Ghidra. You must first install and configure Ghidrathon using the [steps outlined in its README]( https://github.com/mandiant/ghidrathon?tab=readme-ov-file#installing-ghidrathon). Then, you must use the Python 3 interpreter that you configured with Ghidrathon to complete the following steps:
|
||||
|
||||
1. Install capa and its dependencies from PyPI using the following command:
|
||||
```bash
|
||||
$ capa -b ghidra Practical\ Malware\ Analysis\ Lab\ 01-01.exe_
|
||||
┌──────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ md5 │ bb7425b82141a1c0f7d60e5106676bb1 │
|
||||
│ sha1 │ │
|
||||
│ sha256 │ 58898bd42c5bd3bf9b1389f0eee5b39cd59180e8370eb9ea838a0b327bd6fe47 │
|
||||
│ analysis │ static │
|
||||
│ os │ windows │
|
||||
│ format │ pe │
|
||||
│ arch │ i386 │
|
||||
│ path │ ~/Documents/capa/tests/data/Practical Malware Analysis Lab 01-01.exe_ │
|
||||
└──────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ ATT&CK Tactic ┃ ATT&CK Technique ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ DISCOVERY │ File and Directory Discovery [T1083] │
|
||||
└────────────────────────────────────┴─────────────────────────────────────────────────────────────┘
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ MBC Objective ┃ MBC Behavior ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ DISCOVERY │ File and Directory Discovery [E1083] │
|
||||
│ FILE SYSTEM │ Copy File [C0045] │
|
||||
│ │ Read File [C0051] │
|
||||
│ PROCESS │ Terminate Process [C0018] │
|
||||
└────────────────────────────────────┴─────────────────────────────────────────────────────────────┘
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ Capability ┃ Namespace ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ copy file │ host-interaction/file-system/copy │
|
||||
│ enumerate files recursively │ host-interaction/file-system/files/list │
|
||||
│ read file via mapping (2 matches) │ host-interaction/file-system/read │
|
||||
│ terminate process (2 matches) │ host-interaction/process/terminate │
|
||||
│ resolve function by parsing PE exports │ load-code/pe │
|
||||
└────────────────────────────────────────────────┴─────────────────────────────────────────────────┘
|
||||
$ pip install flare-capa
|
||||
```
|
||||
|
||||
## getting started
|
||||
|
||||
### requirements
|
||||
|
||||
- [Ghidra](https://github.com/NationalSecurityAgency/ghidra) >= 12.0 must be installed and available via the `GHIDRA_INSTALL_DIR` environment variable.
|
||||
|
||||
#### standalone binary (recommended)
|
||||
|
||||
The capa [standalone binary](https://github.com/mandiant/capa/releases) is the preferred way to run capa with the Ghidra backend.
|
||||
Although the binary does not bundle the Java environment or Ghidra itself, it will dynamically load them at runtime.
|
||||
|
||||
#### python package
|
||||
|
||||
You can also use the Ghidra backend with the capa Python package by installing `flare-capa` with the `ghidra` extra.
|
||||
|
||||
2. Download and extract the [official capa rules](https://github.com/mandiant/capa-rules/releases) that match the capa version you have installed. You can use the following command to view the version of capa you have installed:
|
||||
```bash
|
||||
$ pip install "flare-capa[ghidra]"
|
||||
$ pip show flare-capa
|
||||
OR
|
||||
$ capa --version
|
||||
```
|
||||
|
||||
### usage
|
||||
3. Copy [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) to your `ghidra_scripts` directory or manually add the parent directory of each script using Ghidra’s Script Manager.
|
||||
|
||||
To use the Ghidra backend, specify it with the `-b` or `--backend` flag:
|
||||
## Usage
|
||||
|
||||
You can execute [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra’s Script Manager. [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) can also be executed using Ghidra's Headless Analyzer.
|
||||
|
||||
### Execution using Ghidra’s Script Manager
|
||||
|
||||
You can execute [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) and [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra's Script Manager as follows:
|
||||
1. Navigate to `Window > Script Manager`
|
||||
2. Expand the `Python 3 > capa` category
|
||||
3. Double-click a script to execute it
|
||||
|
||||
Both scripts ask you to provide the path of your capa rules directory (see installation step 2). [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) also has you choose one of `default`, `verbose`, and `vverbose` output formats which mirror the output formats of capa’s standalone tool.
|
||||
|
||||
### Execution using Ghidra’s Headless Analyzer
|
||||
|
||||
You can execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) using Ghidra’s Headless Analyzer by invoking the `analyzeHeadless` script included with Ghidra in its `support` directory. The following arguments must be provided:
|
||||
|
||||
| Argument | Description |
|
||||
|----|----|
|
||||
|`<project_path>`| Path to Ghidra project|
|
||||
| `<project_name>`| Name of Ghidra Project|
|
||||
| `-Process <sample_name>` OR `-Import <sample_path>`| Name of sample `<sample_name>` already imported into `<project_name>` OR absolute path of sample `<sample_path>` to import into `<project_name>`|
|
||||
| `-ScriptPath <script_path>`| OPTIONAL parent directory `<script_path>` of [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py)|
|
||||
| `-PostScript capa_ghidra.py`| Execute [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) after Ghidra analysis|
|
||||
| `"<script_args>"`| Quoted string `"<script_args>"` containing script arguments passed to [capa_ghidra.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_ghidra.py) that must specify a capa rules path and optionally the output format (`--verbose`, `--vverbose`, `--json`) – you can specify `”help”` to view the script’s help message |
|
||||
|
||||
The following is an example of combining these arguments into a single `analyzeHeadless` script command:
|
||||
```bash
|
||||
$ capa -b ghidra /path/to/sample
|
||||
$ analyzeHeadless /home/wumbo/demo demo -Import /home/wumbo/capa/tests/data/Practical\ Malware\ Analysis\ Lab\ 01-01.dll_ -PostScript capa_ghidra.py "/home/wumbo/capa/rules --verbose"
|
||||
```
|
||||
|
||||
capa will:
|
||||
1. Initialize a headless Ghidra instance.
|
||||
2. Create a temporary project.
|
||||
3. Import and analyze the sample.
|
||||
4. Extract features and match rules.
|
||||
5. Clean up the temporary project.
|
||||
|
||||
**Note:** The first time you run this, it may take a few moments to initialize the Ghidra environment.
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# Run capa against loaded Ghidra database and render results in Ghidra UI
|
||||
# @author Colton Gabertan (gabertan.colton@gmail.com)
|
||||
# @category Python 3.capa
|
||||
|
||||
# Copyright 2024 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -12,63 +16,36 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Run capa against loaded Ghidra database and render results in Ghidra UI
|
||||
|
||||
# @author Colton Gabertan (gabertan.colton@gmail.com)
|
||||
# @category capa
|
||||
# @runtime PyGhidra
|
||||
|
||||
import sys
|
||||
import json
|
||||
import logging
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
from java.util import ArrayList
|
||||
from ghidra.util import Msg
|
||||
from ghidra.app.cmd.label import AddLabelCmd, CreateNamespacesCmd
|
||||
from ghidra.util.exception import CancelledException
|
||||
from ghidra.program.flatapi import FlatProgramAPI
|
||||
from ghidra.program.model.symbol import Namespace, SourceType, SymbolType
|
||||
|
||||
import capa
|
||||
import capa.main
|
||||
import capa.rules
|
||||
import capa.version
|
||||
import capa.render.json
|
||||
import capa.ghidra.helpers
|
||||
import capa.capabilities.common
|
||||
import capa.features.extractors.ghidra.context
|
||||
import capa.features.extractors.ghidra.extractor
|
||||
|
||||
logger = logging.getLogger("capa_explorer")
|
||||
|
||||
|
||||
def show_monitor_message(msg):
|
||||
capa.ghidra.helpers.get_monitor().checkCanceled()
|
||||
capa.ghidra.helpers.get_monitor().setMessage(msg)
|
||||
|
||||
|
||||
def show_error(msg):
|
||||
Msg.showError(None, None, "capa explorer", msg)
|
||||
|
||||
|
||||
def show_warn(msg):
|
||||
Msg.showWarn(None, None, "capa explorer", msg)
|
||||
|
||||
|
||||
def show_info(msg):
|
||||
Msg.showInfo(None, None, "capa explorer", msg)
|
||||
|
||||
|
||||
def add_bookmark(addr, txt, category="CapaExplorer"):
|
||||
"""create bookmark at addr"""
|
||||
capa.ghidra.helpers.get_current_program().getBookmarkManager().setBookmark(addr, "Info", category, txt)
|
||||
currentProgram().getBookmarkManager().setBookmark(addr, "Info", category, txt) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
|
||||
def create_namespace(namespace_str):
|
||||
"""create new Ghidra namespace for each capa namespace"""
|
||||
|
||||
cmd = CreateNamespacesCmd(namespace_str, SourceType.USER_DEFINED)
|
||||
cmd.applyTo(capa.ghidra.helpers.get_current_program())
|
||||
cmd.applyTo(currentProgram()) # type: ignore [name-defined] # noqa: F821
|
||||
return cmd.getNamespace()
|
||||
|
||||
|
||||
@@ -76,7 +53,7 @@ def create_label(ghidra_addr, name, capa_namespace):
|
||||
"""custom label cmd to overlay symbols under capa-generated namespaces"""
|
||||
|
||||
# prevent duplicate labels under the same capa-generated namespace
|
||||
symbol_table = capa.ghidra.helpers.get_current_program().getSymbolTable()
|
||||
symbol_table = currentProgram().getSymbolTable() # type: ignore [name-defined] # noqa: F821
|
||||
for sym in symbol_table.getSymbols(ghidra_addr):
|
||||
if sym.getName(True) == capa_namespace.getName(True) + Namespace.DELIMITER + name:
|
||||
return
|
||||
@@ -84,7 +61,7 @@ def create_label(ghidra_addr, name, capa_namespace):
|
||||
# create SymbolType.LABEL at addr
|
||||
# prioritize capa-generated namespace (duplicate match @ new addr), else put under global Ghidra one (new match)
|
||||
cmd = AddLabelCmd(ghidra_addr, name, True, SourceType.USER_DEFINED)
|
||||
cmd.applyTo(capa.ghidra.helpers.get_current_program())
|
||||
cmd.applyTo(currentProgram()) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
# assign new match overlay label to capa-generated namespace
|
||||
cmd.getSymbol().setNamespace(capa_namespace)
|
||||
@@ -115,8 +92,8 @@ class CapaMatchData:
|
||||
return
|
||||
|
||||
for key in self.matches.keys():
|
||||
addr = capa.ghidra.helpers.get_flat_api().toAddr(hex(key))
|
||||
func = capa.ghidra.helpers.get_flat_api().getFunctionContaining(addr)
|
||||
addr = toAddr(hex(key)) # type: ignore [name-defined] # noqa: F821
|
||||
func = getFunctionContaining(addr) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
# bookmark & tag MITRE ATT&CK tactics & MBC @ function scope
|
||||
if func is not None:
|
||||
@@ -140,160 +117,140 @@ class CapaMatchData:
|
||||
|
||||
def set_plate_comment(self, ghidra_addr):
|
||||
"""set plate comments at matched functions"""
|
||||
comment = capa.ghidra.helpers.get_flat_api().getPlateComment(ghidra_addr)
|
||||
comment = getPlateComment(ghidra_addr) # type: ignore [name-defined] # noqa: F821
|
||||
rule_path = self.namespace.replace(Namespace.DELIMITER, "/")
|
||||
# 2 calls to avoid duplicate comments via subsequent script runs
|
||||
if comment is None:
|
||||
# first comment @ function
|
||||
comment = rule_path + "\n"
|
||||
capa.ghidra.helpers.get_flat_api().setPlateComment(ghidra_addr, comment)
|
||||
setPlateComment(ghidra_addr, comment) # type: ignore [name-defined] # noqa: F821
|
||||
elif rule_path not in comment:
|
||||
comment = comment + rule_path + "\n"
|
||||
capa.ghidra.helpers.get_flat_api().setPlateComment(ghidra_addr, comment)
|
||||
setPlateComment(ghidra_addr, comment) # type: ignore [name-defined] # noqa: F821
|
||||
else:
|
||||
return
|
||||
|
||||
def set_pre_comment(self, ghidra_addr, sub_type, description):
|
||||
"""set pre comments at subscoped matches of main rules"""
|
||||
comment = capa.ghidra.helpers.get_flat_api().getPreComment(ghidra_addr)
|
||||
comment = getPreComment(ghidra_addr) # type: ignore [name-defined] # noqa: F821
|
||||
if comment is None:
|
||||
comment = "capa: " + sub_type + "(" + description + ")" + ' matched in "' + self.capability + '"\n'
|
||||
capa.ghidra.helpers.get_flat_api().setPreComment(ghidra_addr, comment)
|
||||
setPreComment(ghidra_addr, comment) # type: ignore [name-defined] # noqa: F821
|
||||
elif self.capability not in comment:
|
||||
comment = (
|
||||
comment + "capa: " + sub_type + "(" + description + ")" + ' matched in "' + self.capability + '"\n'
|
||||
)
|
||||
capa.ghidra.helpers.get_flat_api().setPreComment(ghidra_addr, comment)
|
||||
setPreComment(ghidra_addr, comment) # type: ignore [name-defined] # noqa: F821
|
||||
else:
|
||||
return
|
||||
|
||||
def label_matches(self, do_namespaces, do_comments):
|
||||
def label_matches(self):
|
||||
"""label findings at function scopes and comment on subscope matches"""
|
||||
capa_namespace = None
|
||||
if do_namespaces:
|
||||
capa_namespace = create_namespace(self.namespace)
|
||||
|
||||
symbol_table = capa.ghidra.helpers.get_current_program().getSymbolTable()
|
||||
capa_namespace = create_namespace(self.namespace)
|
||||
symbol_table = currentProgram().getSymbolTable() # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
# handle function main scope of matched rule
|
||||
# these will typically contain further matches within
|
||||
if self.scope == "function":
|
||||
for addr in self.matches.keys():
|
||||
ghidra_addr = capa.ghidra.helpers.get_flat_api().toAddr(hex(addr))
|
||||
ghidra_addr = toAddr(hex(addr)) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
# classify new function label under capa-generated namespace
|
||||
if do_namespaces:
|
||||
sym = symbol_table.getPrimarySymbol(ghidra_addr)
|
||||
if sym is not None:
|
||||
if sym.getSymbolType() == SymbolType.FUNCTION:
|
||||
create_label(ghidra_addr, sym.getName(), capa_namespace)
|
||||
sym = symbol_table.getPrimarySymbol(ghidra_addr)
|
||||
if sym is not None:
|
||||
if sym.getSymbolType() == SymbolType.FUNCTION:
|
||||
create_label(ghidra_addr, sym.getName(), capa_namespace)
|
||||
self.set_plate_comment(ghidra_addr)
|
||||
|
||||
if do_comments:
|
||||
self.set_plate_comment(ghidra_addr)
|
||||
# parse the corresponding nodes, and pre-comment subscope matched features
|
||||
# under the encompassing function(s)
|
||||
for sub_match in self.matches.get(addr):
|
||||
for loc, node in sub_match.items():
|
||||
sub_ghidra_addr = toAddr(hex(loc)) # type: ignore [name-defined] # noqa: F821
|
||||
if sub_ghidra_addr == ghidra_addr:
|
||||
# skip duplicates
|
||||
continue
|
||||
|
||||
# parse the corresponding nodes, and pre-comment subscope matched features
|
||||
# under the encompassing function(s)
|
||||
for sub_match in self.matches.get(addr):
|
||||
for loc, node in sub_match.items():
|
||||
sub_ghidra_addr = capa.ghidra.helpers.get_flat_api().toAddr(hex(loc))
|
||||
if sub_ghidra_addr == ghidra_addr:
|
||||
# skip duplicates
|
||||
continue
|
||||
|
||||
# precomment subscope matches under the function
|
||||
if node != {} and do_comments:
|
||||
for sub_type, description in parse_node(node):
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
# precomment subscope matches under the function
|
||||
if node != {}:
|
||||
for sub_type, description in parse_node(node):
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
else:
|
||||
# resolve the encompassing function for the capa namespace
|
||||
# of non-function scoped main matches
|
||||
for addr in self.matches.keys():
|
||||
ghidra_addr = capa.ghidra.helpers.get_flat_api().toAddr(hex(addr))
|
||||
ghidra_addr = toAddr(hex(addr)) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
# basic block / insn scoped main matches
|
||||
# Ex. See "Create Process on Windows" Rule
|
||||
func = capa.ghidra.helpers.get_flat_api().getFunctionContaining(ghidra_addr)
|
||||
func = getFunctionContaining(ghidra_addr) # type: ignore [name-defined] # noqa: F821
|
||||
if func is not None:
|
||||
func_addr = func.getEntryPoint()
|
||||
if do_namespaces:
|
||||
create_label(func_addr, func.getName(), capa_namespace)
|
||||
if do_comments:
|
||||
self.set_plate_comment(func_addr)
|
||||
create_label(func_addr, func.getName(), capa_namespace)
|
||||
self.set_plate_comment(func_addr)
|
||||
|
||||
# create subscope match precomments
|
||||
for sub_match in self.matches.get(addr):
|
||||
for loc, node in sub_match.items():
|
||||
sub_ghidra_addr = capa.ghidra.helpers.get_flat_api().toAddr(hex(loc))
|
||||
sub_ghidra_addr = toAddr(hex(loc)) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if node != {}:
|
||||
if func is not None:
|
||||
# basic block/ insn scope under resolved function
|
||||
if do_comments:
|
||||
for sub_type, description in parse_node(node):
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
for sub_type, description in parse_node(node):
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
else:
|
||||
# this would be a global/file scoped main match
|
||||
# try to resolve the encompassing function via the subscope match, instead
|
||||
# Ex. "run as service" rule
|
||||
sub_func = capa.ghidra.helpers.get_flat_api().getFunctionContaining(sub_ghidra_addr)
|
||||
sub_func = getFunctionContaining(sub_ghidra_addr) # type: ignore [name-defined] # noqa: F821
|
||||
if sub_func is not None:
|
||||
sub_func_addr = sub_func.getEntryPoint()
|
||||
# place function in capa namespace & create the subscope match label in Ghidra's global namespace
|
||||
if do_namespaces:
|
||||
create_label(sub_func_addr, sub_func.getName(), capa_namespace)
|
||||
if do_comments:
|
||||
self.set_plate_comment(sub_func_addr)
|
||||
|
||||
if do_comments:
|
||||
for sub_type, description in parse_node(node):
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
create_label(sub_func_addr, sub_func.getName(), capa_namespace)
|
||||
self.set_plate_comment(sub_func_addr)
|
||||
for sub_type, description in parse_node(node):
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
else:
|
||||
# addr is in some other file section like .data
|
||||
# represent this location with a label symbol under the capa namespace
|
||||
# Ex. See "Reference Base64 String" rule
|
||||
if do_namespaces:
|
||||
for _sub_type, _description in parse_node(node):
|
||||
# in many cases, these will be ghidra-labeled data, so just add the existing
|
||||
# label symbol to the capa namespace
|
||||
for sym in symbol_table.getSymbols(sub_ghidra_addr):
|
||||
if sym.getSymbolType() == SymbolType.LABEL:
|
||||
sym.setNamespace(capa_namespace)
|
||||
if do_comments:
|
||||
for sub_type, description in parse_node(node):
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
for sub_type, description in parse_node(node):
|
||||
# in many cases, these will be ghidra-labeled data, so just add the existing
|
||||
# label symbol to the capa namespace
|
||||
for sym in symbol_table.getSymbols(sub_ghidra_addr):
|
||||
if sym.getSymbolType() == SymbolType.LABEL:
|
||||
sym.setNamespace(capa_namespace)
|
||||
self.set_pre_comment(sub_ghidra_addr, sub_type, description)
|
||||
|
||||
|
||||
def get_capabilities():
|
||||
rules_dir = ""
|
||||
|
||||
show_monitor_message(f"requesting capa {capa.version.__version__} rules directory")
|
||||
selected_dir = askDirectory(f"choose capa {capa.version.__version__} rules directory", "Ok") # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if selected_dir:
|
||||
rules_dir = selected_dir.getPath()
|
||||
rules_dir: str = ""
|
||||
try:
|
||||
selected_dir = askDirectory("Choose capa rules directory", "Ok") # type: ignore [name-defined] # noqa: F821
|
||||
if selected_dir:
|
||||
rules_dir = selected_dir.getPath()
|
||||
except RuntimeError:
|
||||
# RuntimeError thrown when user selects "Cancel"
|
||||
pass
|
||||
|
||||
if not rules_dir:
|
||||
raise CancelledException
|
||||
logger.info("You must choose a capa rules directory before running capa.")
|
||||
return "" # return empty str to avoid handling both int and str types
|
||||
|
||||
rules_path: pathlib.Path = pathlib.Path(rules_dir)
|
||||
logger.info("running capa using rules from %s", str(rules_path))
|
||||
|
||||
show_monitor_message(f"loading rules from {rules_path}")
|
||||
rules = capa.rules.get_rules([rules_path])
|
||||
|
||||
show_monitor_message("collecting binary metadata")
|
||||
meta = capa.ghidra.helpers.collect_metadata([rules_path])
|
||||
|
||||
show_monitor_message("running capa analysis")
|
||||
extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor()
|
||||
|
||||
capabilities = capa.capabilities.common.find_capabilities(rules, extractor, True)
|
||||
|
||||
show_monitor_message("checking for static limitations")
|
||||
if capa.capabilities.common.has_static_limitation(rules, capabilities, is_standalone=False):
|
||||
show_warn(
|
||||
"capa explorer encountered warnings during analysis. Please check the console output for more information.",
|
||||
)
|
||||
popup("capa explorer encountered warnings during analysis. Please check the console output for more information.") # type: ignore [name-defined] # noqa: F821
|
||||
logger.info("capa encountered warnings during analysis")
|
||||
|
||||
show_monitor_message("rendering results")
|
||||
return capa.render.json.render(meta, rules, capabilities.matches)
|
||||
|
||||
|
||||
@@ -371,12 +328,12 @@ def parse_json(capa_data):
|
||||
# this requires the correct delimiter used by Ghidra
|
||||
# Ex. 'communication/named-pipe/create/create pipe' -> capa::communication::named-pipe::create::create-pipe
|
||||
namespace_str = Namespace.DELIMITER.join(meta["namespace"].split("/"))
|
||||
namespace = "capa_explorer" + Namespace.DELIMITER + namespace_str + fmt_rule
|
||||
namespace = "capa" + Namespace.DELIMITER + namespace_str + fmt_rule
|
||||
else:
|
||||
# lib rules via the official rules repo will not contain data
|
||||
# for the "namespaces" key, so format using rule itself
|
||||
# Ex. 'contain loop' -> capa::lib::contain-loop
|
||||
namespace = "capa_explorer" + Namespace.DELIMITER + "lib" + fmt_rule
|
||||
namespace = "capa" + Namespace.DELIMITER + "lib" + fmt_rule
|
||||
|
||||
yield CapaMatchData(namespace, scope, rule, rule_matches, attack, mbc)
|
||||
|
||||
@@ -385,79 +342,44 @@ def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
choices = ["namespaces", "bookmarks", "comments"]
|
||||
# use ArrayList to resolve ambiguous askChoices overloads (List vs List, List) in PyGhidra
|
||||
choices_java = ArrayList()
|
||||
for c in choices:
|
||||
choices_java.add(c)
|
||||
if isRunningHeadless(): # type: ignore [name-defined] # noqa: F821
|
||||
logger.error("unsupported Ghidra execution mode")
|
||||
return capa.main.E_UNSUPPORTED_GHIDRA_EXECUTION_MODE
|
||||
|
||||
choice_labels = [
|
||||
'add "capa_explorer" namespace for matched functions',
|
||||
"add bookmarks for matched functions",
|
||||
"add comments to matched functions",
|
||||
]
|
||||
# use ArrayList to resolve ambiguous askChoices overloads (List vs List, List) in PyGhidra
|
||||
choice_labels_java = ArrayList()
|
||||
for c in choice_labels:
|
||||
choice_labels_java.add(c)
|
||||
|
||||
selected = list(askChoices("capa explorer", "select actions:", choices_java, choice_labels_java)) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
do_namespaces = "namespaces" in selected
|
||||
do_comments = "comments" in selected
|
||||
do_bookmarks = "bookmarks" in selected
|
||||
|
||||
if not any((do_namespaces, do_comments, do_bookmarks)):
|
||||
raise CancelledException("no actions selected")
|
||||
|
||||
# initialize the context for the extractor/helpers
|
||||
capa.features.extractors.ghidra.context.set_context(
|
||||
currentProgram, # type: ignore [name-defined] # noqa: F821
|
||||
FlatProgramAPI(currentProgram), # type: ignore [name-defined] # noqa: F821
|
||||
monitor, # type: ignore [name-defined] # noqa: F821
|
||||
)
|
||||
|
||||
show_monitor_message("checking supported Ghidra version")
|
||||
if not capa.ghidra.helpers.is_supported_ghidra_version():
|
||||
show_error("unsupported Ghidra version")
|
||||
logger.error("unsupported Ghidra version")
|
||||
return capa.main.E_UNSUPPORTED_GHIDRA_VERSION
|
||||
|
||||
show_monitor_message("checking supported file type")
|
||||
if not capa.ghidra.helpers.is_supported_file_type():
|
||||
show_error("unsupported file type")
|
||||
logger.error("unsupported file type")
|
||||
return capa.main.E_INVALID_FILE_TYPE
|
||||
|
||||
show_monitor_message("checking supported file architecture")
|
||||
if not capa.ghidra.helpers.is_supported_arch_type():
|
||||
show_error("unsupported file architecture")
|
||||
logger.error("unsupported file architecture")
|
||||
return capa.main.E_INVALID_FILE_ARCH
|
||||
|
||||
# capa_data will always contain {'meta':..., 'rules':...}
|
||||
# if the 'rules' key contains no values, then there were no matches
|
||||
capa_data = json.loads(get_capabilities())
|
||||
if capa_data.get("rules") is None:
|
||||
show_info("capa explorer found no matches.")
|
||||
logger.info("capa explorer found no matches")
|
||||
popup("capa explorer found no matches.") # type: ignore [name-defined] # noqa: F821
|
||||
return capa.main.E_EMPTY_REPORT
|
||||
|
||||
show_monitor_message("processing matches")
|
||||
for item in parse_json(capa_data):
|
||||
if do_bookmarks:
|
||||
show_monitor_message("adding bookmarks")
|
||||
item.bookmark_functions()
|
||||
if do_namespaces or do_comments:
|
||||
show_monitor_message("adding labels")
|
||||
item.label_matches(do_namespaces, do_comments)
|
||||
|
||||
show_info("capa explorer analysis complete.")
|
||||
|
||||
item.bookmark_functions()
|
||||
item.label_matches()
|
||||
logger.info("capa explorer analysis complete")
|
||||
popup("capa explorer analysis complete.\nPlease see results in the Bookmarks Window and Namespaces section of the Symbol Tree Window.") # type: ignore [name-defined] # noqa: F821
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
if main() != 0:
|
||||
show_error(
|
||||
"capa explorer encountered errors during analysis. Please check the console output for more information.",
|
||||
)
|
||||
except CancelledException:
|
||||
show_info("capa explorer analysis cancelled.")
|
||||
if sys.version_info < (3, 10):
|
||||
from capa.exceptions import UnsupportedRuntimeError
|
||||
|
||||
raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.10+")
|
||||
exit_code = main()
|
||||
if exit_code != 0:
|
||||
popup("capa explorer encountered errors during analysis. Please check the console output for more information.") # type: ignore [name-defined] # noqa: F821
|
||||
sys.exit(exit_code)
|
||||
174
capa/ghidra/capa_ghidra.py
Normal file
174
capa/ghidra/capa_ghidra.py
Normal file
@@ -0,0 +1,174 @@
|
||||
# Run capa against loaded Ghidra database and render results in Ghidra Console window
|
||||
# @author Mike Hunhoff (mehunhoff@google.com)
|
||||
# @category Python 3.capa
|
||||
|
||||
# Copyright 2023 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import logging
|
||||
import pathlib
|
||||
import argparse
|
||||
|
||||
import capa
|
||||
import capa.main
|
||||
import capa.rules
|
||||
import capa.ghidra.helpers
|
||||
import capa.render.default
|
||||
import capa.capabilities.common
|
||||
import capa.features.extractors.ghidra.extractor
|
||||
|
||||
logger = logging.getLogger("capa_ghidra")
|
||||
|
||||
|
||||
def run_headless():
|
||||
parser = argparse.ArgumentParser(description="The FLARE team's open-source tool to integrate capa with Ghidra.")
|
||||
|
||||
parser.add_argument(
|
||||
"rules",
|
||||
type=str,
|
||||
help="path to rule file or directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="enable verbose result document (no effect with --json)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-vv", "--vverbose", action="store_true", help="enable very verbose result document (no effect with --json)"
|
||||
)
|
||||
parser.add_argument("-d", "--debug", action="store_true", help="enable debugging output on STDERR")
|
||||
parser.add_argument("-q", "--quiet", action="store_true", help="disable all output but errors")
|
||||
parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text")
|
||||
|
||||
script_args = list(getScriptArgs()) # type: ignore [name-defined] # noqa: F821
|
||||
if not script_args or len(script_args) > 1:
|
||||
script_args = []
|
||||
else:
|
||||
script_args = script_args[0].split()
|
||||
for idx, arg in enumerate(script_args):
|
||||
if arg.lower() == "help":
|
||||
script_args[idx] = "--help"
|
||||
|
||||
args = parser.parse_args(args=script_args)
|
||||
|
||||
if args.quiet:
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
logging.getLogger().setLevel(logging.WARNING)
|
||||
elif args.debug:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
logger.debug("running in Ghidra headless mode")
|
||||
|
||||
rules_path = pathlib.Path(args.rules)
|
||||
|
||||
logger.debug("rule path: %s", rules_path)
|
||||
rules = capa.rules.get_rules([rules_path])
|
||||
|
||||
meta = capa.ghidra.helpers.collect_metadata([rules_path])
|
||||
extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor()
|
||||
|
||||
capabilities = capa.capabilities.common.find_capabilities(rules, extractor, False)
|
||||
|
||||
meta.analysis.feature_counts = capabilities.feature_counts
|
||||
meta.analysis.library_functions = capabilities.library_functions
|
||||
meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities.matches)
|
||||
|
||||
if capa.capabilities.common.has_static_limitation(rules, capabilities, is_standalone=True):
|
||||
logger.info("capa encountered warnings during analysis")
|
||||
|
||||
if args.json:
|
||||
print(capa.render.json.render(meta, rules, capabilities.matches)) # noqa: T201
|
||||
elif args.vverbose:
|
||||
print(capa.render.vverbose.render(meta, rules, capabilities.matches)) # noqa: T201
|
||||
elif args.verbose:
|
||||
print(capa.render.verbose.render(meta, rules, capabilities.matches)) # noqa: T201
|
||||
else:
|
||||
print(capa.render.default.render(meta, rules, capabilities.matches)) # noqa: T201
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def run_ui():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
rules_dir: str = ""
|
||||
try:
|
||||
selected_dir = askDirectory("Choose capa rules directory", "Ok") # type: ignore [name-defined] # noqa: F821
|
||||
if selected_dir:
|
||||
rules_dir = selected_dir.getPath()
|
||||
except RuntimeError:
|
||||
# RuntimeError thrown when user selects "Cancel"
|
||||
pass
|
||||
|
||||
if not rules_dir:
|
||||
logger.info("You must choose a capa rules directory before running capa.")
|
||||
return capa.main.E_MISSING_RULES
|
||||
|
||||
verbose = askChoice( # type: ignore [name-defined] # noqa: F821
|
||||
"capa output verbosity", "Choose capa output verbosity", ["default", "verbose", "vverbose"], "default"
|
||||
)
|
||||
|
||||
rules_path: pathlib.Path = pathlib.Path(rules_dir)
|
||||
logger.info("running capa using rules from %s", str(rules_path))
|
||||
|
||||
rules = capa.rules.get_rules([rules_path])
|
||||
|
||||
meta = capa.ghidra.helpers.collect_metadata([rules_path])
|
||||
extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor()
|
||||
|
||||
capabilities = capa.capabilities.common.find_capabilities(rules, extractor, True)
|
||||
|
||||
meta.analysis.feature_counts = capabilities.feature_counts
|
||||
meta.analysis.library_functions = capabilities.library_functions
|
||||
meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities.matches)
|
||||
|
||||
if capa.capabilities.common.has_static_limitation(rules, capabilities, is_standalone=False):
|
||||
logger.info("capa encountered warnings during analysis")
|
||||
|
||||
if verbose == "vverbose":
|
||||
print(capa.render.vverbose.render(meta, rules, capabilities.matches)) # noqa: T201
|
||||
elif verbose == "verbose":
|
||||
print(capa.render.verbose.render(meta, rules, capabilities.matches)) # noqa: T201
|
||||
else:
|
||||
print(capa.render.default.render(meta, rules, capabilities.matches)) # noqa: T201
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def main():
|
||||
if not capa.ghidra.helpers.is_supported_ghidra_version():
|
||||
return capa.main.E_UNSUPPORTED_GHIDRA_VERSION
|
||||
|
||||
if not capa.ghidra.helpers.is_supported_file_type():
|
||||
return capa.main.E_INVALID_FILE_TYPE
|
||||
|
||||
if not capa.ghidra.helpers.is_supported_arch_type():
|
||||
return capa.main.E_INVALID_FILE_ARCH
|
||||
|
||||
if isRunningHeadless(): # type: ignore [name-defined] # noqa: F821
|
||||
return run_headless()
|
||||
else:
|
||||
return run_ui()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if sys.version_info < (3, 10):
|
||||
from capa.exceptions import UnsupportedRuntimeError
|
||||
|
||||
raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.10+")
|
||||
sys.exit(main())
|
||||
@@ -22,7 +22,6 @@ import capa.version
|
||||
import capa.features.common
|
||||
import capa.features.freeze
|
||||
import capa.render.result_document as rdoc
|
||||
import capa.features.extractors.ghidra.context as ghidra_context
|
||||
import capa.features.extractors.ghidra.helpers
|
||||
from capa.features.address import AbsoluteVirtualAddress
|
||||
|
||||
@@ -32,18 +31,6 @@ logger = logging.getLogger("capa")
|
||||
SUPPORTED_FILE_TYPES = ("Executable and Linking Format (ELF)", "Portable Executable (PE)", "Raw Binary")
|
||||
|
||||
|
||||
def get_current_program():
|
||||
return ghidra_context.get_context().program
|
||||
|
||||
|
||||
def get_flat_api():
|
||||
return ghidra_context.get_context().flat_api
|
||||
|
||||
|
||||
def get_monitor():
|
||||
return ghidra_context.get_context().monitor
|
||||
|
||||
|
||||
class GHIDRAIO:
|
||||
"""
|
||||
An object that acts as a file-like object,
|
||||
@@ -61,12 +48,7 @@ class GHIDRAIO:
|
||||
self.offset = offset
|
||||
|
||||
def read(self, size):
|
||||
logger.debug(
|
||||
"reading 0x%x bytes at 0x%x (ea: 0x%x)",
|
||||
size,
|
||||
self.offset,
|
||||
get_current_program().getImageBase().add(self.offset).getOffset(),
|
||||
)
|
||||
logger.debug("reading 0x%x bytes at 0x%x (ea: 0x%x)", size, self.offset, currentProgram().getImageBase().add(self.offset).getOffset()) # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if size > len(self.bytes_) - self.offset:
|
||||
logger.debug("cannot read 0x%x bytes at 0x%x (ea: BADADDR)", size, self.offset)
|
||||
@@ -78,7 +60,7 @@ class GHIDRAIO:
|
||||
return
|
||||
|
||||
def get_bytes(self):
|
||||
file_bytes = get_current_program().getMemory().getAllFileBytes()[0]
|
||||
file_bytes = currentProgram().getMemory().getAllFileBytes()[0] # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
# getOriginalByte() allows for raw file parsing on the Ghidra side
|
||||
# other functions will fail as Ghidra will think that it's reading uninitialized memory
|
||||
@@ -88,32 +70,21 @@ class GHIDRAIO:
|
||||
|
||||
|
||||
def is_supported_ghidra_version():
|
||||
import ghidra.framework
|
||||
|
||||
version = ghidra.framework.Application.getApplicationVersion()
|
||||
try:
|
||||
# version format example: "11.1.2" or "11.4"
|
||||
major, minor = map(int, version.split(".")[:2])
|
||||
if major < 12:
|
||||
logger.error("-" * 80)
|
||||
logger.error(" Ghidra version %s is not supported.", version)
|
||||
logger.error(" ")
|
||||
logger.error(" capa requires Ghidra 12.0 or higher.")
|
||||
logger.error("-" * 80)
|
||||
return False
|
||||
except ValueError:
|
||||
logger.warning("could not parse Ghidra version: %s", version)
|
||||
version = float(getGhidraVersion()[:4]) # type: ignore [name-defined] # noqa: F821
|
||||
if version < 10.2:
|
||||
warning_msg = "capa does not support this Ghidra version"
|
||||
logger.warning(warning_msg)
|
||||
logger.warning("Your Ghidra version is: %s. Supported versions are: Ghidra >= 10.2", version)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def is_running_headless():
|
||||
return True # PyGhidra is always headless in this context
|
||||
return isRunningHeadless() # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
|
||||
def is_supported_file_type():
|
||||
file_info = get_current_program().getExecutableFormat()
|
||||
file_info = currentProgram().getExecutableFormat() # type: ignore [name-defined] # noqa: F821
|
||||
if file_info not in SUPPORTED_FILE_TYPES:
|
||||
logger.error("-" * 80)
|
||||
logger.error(" Input file does not appear to be a supported file type.")
|
||||
@@ -128,7 +99,7 @@ def is_supported_file_type():
|
||||
|
||||
|
||||
def is_supported_arch_type():
|
||||
lang_id = str(get_current_program().getLanguageID()).lower()
|
||||
lang_id = str(currentProgram().getLanguageID()).lower() # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
if not all((lang_id.startswith("x86"), any(arch in lang_id for arch in ("32", "64")))):
|
||||
logger.error("-" * 80)
|
||||
@@ -141,18 +112,18 @@ def is_supported_arch_type():
|
||||
|
||||
|
||||
def get_file_md5():
|
||||
return get_current_program().getExecutableMD5()
|
||||
return currentProgram().getExecutableMD5() # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
|
||||
def get_file_sha256():
|
||||
return get_current_program().getExecutableSHA256()
|
||||
return currentProgram().getExecutableSHA256() # type: ignore [name-defined] # noqa: F821
|
||||
|
||||
|
||||
def collect_metadata(rules: list[Path]):
|
||||
md5 = get_file_md5()
|
||||
sha256 = get_file_sha256()
|
||||
|
||||
info = get_current_program().getLanguageID().toString()
|
||||
info = currentProgram().getLanguageID().toString() # type: ignore [name-defined] # noqa: F821
|
||||
if "x86" in info and "64" in info:
|
||||
arch = "x86_64"
|
||||
elif "x86" in info and "32" in info:
|
||||
@@ -160,11 +131,11 @@ def collect_metadata(rules: list[Path]):
|
||||
else:
|
||||
arch = "unknown arch"
|
||||
|
||||
format_name: str = get_current_program().getExecutableFormat()
|
||||
format_name: str = currentProgram().getExecutableFormat() # type: ignore [name-defined] # noqa: F821
|
||||
if "PE" in format_name:
|
||||
os = "windows"
|
||||
elif "ELF" in format_name:
|
||||
with contextlib.closing(GHIDRAIO()) as f:
|
||||
with contextlib.closing(capa.ghidra.helpers.GHIDRAIO()) as f:
|
||||
os = capa.features.extractors.elf.detect_elf_os(f)
|
||||
else:
|
||||
os = "unknown os"
|
||||
@@ -177,18 +148,16 @@ def collect_metadata(rules: list[Path]):
|
||||
md5=md5,
|
||||
sha1="",
|
||||
sha256=sha256,
|
||||
path=get_current_program().getExecutablePath(),
|
||||
path=currentProgram().getExecutablePath(), # type: ignore [name-defined] # noqa: F821
|
||||
),
|
||||
flavor=rdoc.Flavor.STATIC,
|
||||
analysis=rdoc.StaticAnalysis(
|
||||
format=get_current_program().getExecutableFormat(),
|
||||
format=currentProgram().getExecutableFormat(), # type: ignore [name-defined] # noqa: F821
|
||||
arch=arch,
|
||||
os=os,
|
||||
extractor="ghidra",
|
||||
rules=tuple(r.resolve().absolute().as_posix() for r in rules),
|
||||
base_address=capa.features.freeze.Address.from_capa(
|
||||
AbsoluteVirtualAddress(get_current_program().getImageBase().getOffset())
|
||||
),
|
||||
base_address=capa.features.freeze.Address.from_capa(AbsoluteVirtualAddress(currentProgram().getImageBase().getOffset())), # type: ignore [name-defined] # noqa: F821
|
||||
layout=rdoc.StaticLayout(
|
||||
functions=(),
|
||||
),
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
<div align="center">
|
||||
<img src="https://github.com/mandiant/capa/blob/master/doc/img/ghidra_backend_logo.png" width=240 height=125>
|
||||
</div>
|
||||
|
||||
# capa explorer for Ghidra
|
||||
|
||||
capa explorer for Ghidra brings capa’s detection capabilities directly to Ghidra’s user interface helping speed up your reverse engineering tasks by identifying what parts of a program suggest interesting behavior, such as setting a registry value. You can execute (via [PyGhidra](https://github.com/NationalSecurityAgency/ghidra/tree/master/Ghidra/Features/PyGhidra)) the script [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/plugin/capa_explorer.py) using Ghidra’s Script Manager window to run capa’s analysis and view the results in Ghidra.
|
||||
|
||||
## ui integration
|
||||
[capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/capa_explorer.py) renders capa results in Ghidra's UI to help you quickly navigate them. This includes adding matched functions to Ghidra’s Symbol Tree and Bookmarks windows and adding comments to functions that indicate matched capabilities and features. You can execute this script using Ghidra’s Script Manager window.
|
||||
|
||||
### symbol tree window
|
||||
Matched functions are added to Ghidra's Symbol Tree window under a custom namespace that maps to the capabilities' [capa namespace](https://github.com/mandiant/capa-rules/blob/master/doc/format.md#rule-namespace).
|
||||
<div align="center">
|
||||
<img src="https://github.com/mandiant/capa/assets/66766340/eeae33f4-99d4-42dc-a5e8-4c1b8c661492" width=300>
|
||||
</div>
|
||||
|
||||
### comments
|
||||
|
||||
Comments are added at the beginning of matched functions indicating matched capabilities and inline comments are added to functions indicating matched features. You can view these comments in Ghidra’s Disassembly Listing and Decompile windows.
|
||||
<div align="center">
|
||||
<img src="https://github.com/mandiant/capa/assets/66766340/bb2b4170-7fd4-45fc-8c7b-ff8f2e2f101b" width=1000>
|
||||
</div>
|
||||
|
||||
### bookmarks
|
||||
|
||||
Bookmarks are added to functions that matched a capability that is mapped to a MITRE ATT&CK and/or Malware Behavior Catalog (MBC) technique. You can view these bookmarks in Ghidra's Bookmarks window.
|
||||
<div align="center">
|
||||
<img src="https://github.com/mandiant/capa/assets/66766340/7f9a66a9-7be7-4223-91c6-4b8fc4651336" width=825>
|
||||
</div>
|
||||
|
||||
# getting started
|
||||
|
||||
## requirements
|
||||
|
||||
- [Ghidra](https://github.com/NationalSecurityAgency/ghidra) >= 12.0 must be installed.
|
||||
- [flare-capa](https://pypi.org/project/flare-capa/) >= 10.0 must be installed (virtual environment recommended) with the `ghidra` extra (e.g., `pip install "flare-capa[ghidra]"`).
|
||||
- [capa rules](https://github.com/mandiant/capa-rules) must be downloaded for the version of capa you are using.
|
||||
|
||||
## execution
|
||||
|
||||
### 1. run Ghidra with PyGhidra
|
||||
You must start Ghidra using the `pyghidraRun` script provided in the support directory of your Ghidra installation to ensure the Python environment is correctly loaded. You should execute `pyghidraRun` from within the Python environment that you used to install capa.
|
||||
|
||||
```bash
|
||||
<ghidra_install>/support/pyghidraRun
|
||||
```
|
||||
|
||||
### 2. run capa_explorer.py
|
||||
1. Open your Ghidra project and CodeBrowser.
|
||||
2. Open the Script Manager.
|
||||
3. Add [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ghidra/plugin/capa_explorer.py) to the script directories.
|
||||
4. Filter for capa and run the script.
|
||||
5. When prompted, select the directory containing the downloaded capa rules.
|
||||
@@ -96,7 +96,11 @@ def is_runtime_ida():
|
||||
|
||||
|
||||
def is_runtime_ghidra():
|
||||
return importlib.util.find_spec("ghidra") is not None
|
||||
try:
|
||||
currentProgram # type: ignore [name-defined] # noqa: F821
|
||||
except NameError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def assert_never(value) -> NoReturn:
|
||||
@@ -327,9 +331,6 @@ def log_unsupported_os_error():
|
||||
logger.error(" ")
|
||||
logger.error(" capa currently only analyzes executables for some operating systems")
|
||||
logger.error(" (including Windows, Linux, and Android).")
|
||||
logger.error(" ")
|
||||
logger.error(" If you know the target OS, you can specify it explicitly, for example:")
|
||||
logger.error(" capa --os linux <sample>")
|
||||
logger.error("-" * 80)
|
||||
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ import logging
|
||||
import idaapi
|
||||
import ida_kernwin
|
||||
|
||||
from capa.ida.plugin.form import CapaExplorerForm
|
||||
from capa.ida.plugin.icon import ICON
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -73,9 +74,6 @@ class CapaExplorerPlugin(idaapi.plugin_t):
|
||||
arg (int): bitflag. Setting LSB enables automatic analysis upon
|
||||
loading. The other bits are currently undefined. See `form.Options`.
|
||||
"""
|
||||
# delay import to not trigger load of Qt components when not running in idaq, i.e., in idalib
|
||||
from capa.ida.plugin.form import CapaExplorerForm
|
||||
|
||||
if not self.form:
|
||||
self.form = CapaExplorerForm(self.PLUGIN_NAME, arg)
|
||||
else:
|
||||
|
||||
@@ -14,9 +14,9 @@
|
||||
|
||||
|
||||
import ida_kernwin
|
||||
from PyQt5 import QtCore
|
||||
|
||||
from capa.ida.plugin.error import UserCancelledError
|
||||
from capa.ida.plugin.qt_compat import QtCore, Signal
|
||||
from capa.features.extractors.ida.extractor import IdaFeatureExtractor
|
||||
from capa.features.extractors.base_extractor import FunctionHandle
|
||||
|
||||
@@ -24,7 +24,7 @@ from capa.features.extractors.base_extractor import FunctionHandle
|
||||
class CapaExplorerProgressIndicator(QtCore.QObject):
|
||||
"""implement progress signal, used during feature extraction"""
|
||||
|
||||
progress = Signal(str)
|
||||
progress = QtCore.pyqtSignal(str)
|
||||
|
||||
def update(self, text):
|
||||
"""emit progress update
|
||||
|
||||
@@ -23,6 +23,7 @@ from pathlib import Path
|
||||
import idaapi
|
||||
import ida_kernwin
|
||||
import ida_settings
|
||||
from PyQt5 import QtGui, QtCore, QtWidgets
|
||||
|
||||
import capa.main
|
||||
import capa.rules
|
||||
@@ -50,7 +51,6 @@ from capa.ida.plugin.hooks import CapaExplorerIdaHooks
|
||||
from capa.ida.plugin.model import CapaExplorerDataModel
|
||||
from capa.ida.plugin.proxy import CapaExplorerRangeProxyModel, CapaExplorerSearchProxyModel
|
||||
from capa.ida.plugin.extractor import CapaExplorerFeatureExtractor
|
||||
from capa.ida.plugin.qt_compat import QtGui, QtCore, QtWidgets
|
||||
from capa.features.extractors.base_extractor import FunctionHandle
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -1358,7 +1358,7 @@ class CapaExplorerForm(idaapi.PluginForm):
|
||||
|
||||
@param state: checked state
|
||||
"""
|
||||
if state:
|
||||
if state == QtCore.Qt.Checked:
|
||||
self.limit_results_to_function(idaapi.get_func(idaapi.get_screen_ea()))
|
||||
else:
|
||||
self.range_model_proxy.reset_address_range_filter()
|
||||
@@ -1367,7 +1367,7 @@ class CapaExplorerForm(idaapi.PluginForm):
|
||||
|
||||
def slot_checkbox_limit_features_by_ea(self, state):
|
||||
""" """
|
||||
if state:
|
||||
if state == QtCore.Qt.Checked:
|
||||
self.view_rulegen_features.filter_items_by_ea(idaapi.get_screen_ea())
|
||||
else:
|
||||
self.view_rulegen_features.show_all_items()
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
{
|
||||
"IDAMetadataDescriptorVersion": 1,
|
||||
"plugin": {
|
||||
"name": "capa",
|
||||
"entryPoint": "capa_explorer.py",
|
||||
"version": "9.3.1",
|
||||
"idaVersions": ">=7.4",
|
||||
"description": "Identify capabilities in executable files using FLARE's capa framework",
|
||||
"license": "Apache-2.0",
|
||||
"categories": [
|
||||
"malware-analysis",
|
||||
"api-scripting-and-automation",
|
||||
"ui-ux-and-visualization"
|
||||
],
|
||||
"pythonDependencies": ["flare-capa==9.3.1"],
|
||||
"urls": {
|
||||
"repository": "https://github.com/mandiant/capa"
|
||||
},
|
||||
"authors": [
|
||||
{"name": "Willi Ballenthin", "email": "wballenthin@hex-rays.com"},
|
||||
{"name": "Moritz Raabe", "email": "moritzraabe@google.com"},
|
||||
{"name": "Mike Hunhoff", "email": "mike.hunhoff@gmail.com"},
|
||||
{"name": "Yacine Elhamer", "email": "elhamer.yacine@gmail.com"}
|
||||
],
|
||||
"keywords": [
|
||||
"capability-detection",
|
||||
"malware-analysis",
|
||||
"behavior-analysis",
|
||||
"reverse-engineering",
|
||||
"att&ck",
|
||||
"rule-engine",
|
||||
"feature-extraction",
|
||||
"yara-like-rules",
|
||||
"static-analysis",
|
||||
"dynamic-analysis"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -18,10 +18,10 @@ from typing import Iterator, Optional
|
||||
|
||||
import idc
|
||||
import idaapi
|
||||
from PyQt5 import QtCore
|
||||
|
||||
import capa.ida.helpers
|
||||
from capa.features.address import Address, FileOffsetAddress, AbsoluteVirtualAddress
|
||||
from capa.ida.plugin.qt_compat import QtCore, qt_get_item_flag_tristate
|
||||
|
||||
|
||||
def info_to_name(display):
|
||||
@@ -55,7 +55,7 @@ class CapaExplorerDataItem:
|
||||
self.flags = QtCore.Qt.ItemIsEnabled | QtCore.Qt.ItemIsSelectable
|
||||
|
||||
if self._can_check:
|
||||
self.flags = self.flags | QtCore.Qt.ItemIsUserCheckable | qt_get_item_flag_tristate()
|
||||
self.flags = self.flags | QtCore.Qt.ItemIsUserCheckable | QtCore.Qt.ItemIsTristate
|
||||
|
||||
if self.pred:
|
||||
self.pred.appendChild(self)
|
||||
|
||||
@@ -18,6 +18,7 @@ from collections import deque
|
||||
|
||||
import idc
|
||||
import idaapi
|
||||
from PyQt5 import QtGui, QtCore
|
||||
|
||||
import capa.rules
|
||||
import capa.ida.helpers
|
||||
@@ -41,7 +42,6 @@ from capa.ida.plugin.item import (
|
||||
CapaExplorerInstructionViewItem,
|
||||
)
|
||||
from capa.features.address import Address, AbsoluteVirtualAddress
|
||||
from capa.ida.plugin.qt_compat import QtGui, QtCore
|
||||
|
||||
# default highlight color used in IDA window
|
||||
DEFAULT_HIGHLIGHT = 0xE6C700
|
||||
@@ -269,7 +269,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
visited.add(child_index)
|
||||
|
||||
for idx in range(self.rowCount(child_index)):
|
||||
stack.append(self.index(idx, 0, child_index))
|
||||
stack.append(child_index.child(idx, 0))
|
||||
|
||||
def reset_ida_highlighting(self, item, checked):
|
||||
"""reset IDA highlight for item
|
||||
|
||||
@@ -12,8 +12,10 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from PyQt5 import QtCore
|
||||
from PyQt5.QtCore import Qt
|
||||
|
||||
from capa.ida.plugin.model import CapaExplorerDataModel
|
||||
from capa.ida.plugin.qt_compat import Qt, QtCore
|
||||
|
||||
|
||||
class CapaExplorerRangeProxyModel(QtCore.QSortFilterProxyModel):
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
# Copyright 2020 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Qt compatibility layer for capa IDA Pro plugin.
|
||||
|
||||
Handles PyQt5 (IDA < 9.2) vs PySide6 (IDA >= 9.2) differences.
|
||||
This module provides a unified import interface for Qt modules and handles
|
||||
API changes between Qt5 and Qt6.
|
||||
"""
|
||||
|
||||
try:
|
||||
# IDA 9.2+ uses PySide6
|
||||
from PySide6 import QtGui, QtCore, QtWidgets
|
||||
from PySide6.QtGui import QAction
|
||||
|
||||
QT_LIBRARY = "PySide6"
|
||||
Signal = QtCore.Signal
|
||||
except ImportError:
|
||||
# Older IDA versions use PyQt5
|
||||
try:
|
||||
from PyQt5 import QtGui, QtCore, QtWidgets
|
||||
from PyQt5.QtWidgets import QAction
|
||||
|
||||
QT_LIBRARY = "PyQt5"
|
||||
Signal = QtCore.pyqtSignal
|
||||
except ImportError:
|
||||
raise ImportError("Neither PySide6 nor PyQt5 is available. Cannot initialize capa IDA plugin.")
|
||||
|
||||
Qt = QtCore.Qt
|
||||
|
||||
|
||||
def qt_get_item_flag_tristate():
|
||||
"""
|
||||
Get the tristate item flag compatible with Qt5 and Qt6.
|
||||
|
||||
Qt5 (PyQt5): Uses Qt.ItemIsTristate
|
||||
Qt6 (PySide6): Qt.ItemIsTristate was removed, uses Qt.ItemIsAutoTristate
|
||||
|
||||
ItemIsAutoTristate automatically manages tristate based on child checkboxes,
|
||||
matching the original ItemIsTristate behavior where parent checkboxes reflect
|
||||
the check state of their children.
|
||||
|
||||
Returns:
|
||||
int: The appropriate flag value for the Qt version
|
||||
|
||||
Raises:
|
||||
AttributeError: If the tristate flag cannot be found in the Qt library
|
||||
"""
|
||||
if QT_LIBRARY == "PySide6":
|
||||
# Qt6: ItemIsTristate was removed, replaced with ItemIsAutoTristate
|
||||
# Try different possible locations (API varies slightly across PySide6 versions)
|
||||
if hasattr(Qt, "ItemIsAutoTristate"):
|
||||
return Qt.ItemIsAutoTristate
|
||||
elif hasattr(Qt, "ItemFlag") and hasattr(Qt.ItemFlag, "ItemIsAutoTristate"):
|
||||
return Qt.ItemFlag.ItemIsAutoTristate
|
||||
else:
|
||||
raise AttributeError(
|
||||
"Cannot find ItemIsAutoTristate in PySide6. "
|
||||
+ "Your PySide6 version may be incompatible with capa. "
|
||||
+ f"Available Qt attributes: {[attr for attr in dir(Qt) if 'Item' in attr]}"
|
||||
)
|
||||
else:
|
||||
# Qt5: Use the original ItemIsTristate flag
|
||||
return Qt.ItemIsTristate
|
||||
|
||||
|
||||
__all__ = ["qt_get_item_flag_tristate", "Signal", "QAction", "QtGui", "QtCore", "QtWidgets"]
|
||||
@@ -18,6 +18,7 @@ from collections import Counter
|
||||
|
||||
import idc
|
||||
import idaapi
|
||||
from PyQt5 import QtGui, QtCore, QtWidgets
|
||||
|
||||
import capa.rules
|
||||
import capa.engine
|
||||
@@ -27,7 +28,6 @@ import capa.features.basicblock
|
||||
from capa.ida.plugin.item import CapaExplorerFunctionItem
|
||||
from capa.features.address import AbsoluteVirtualAddress, _NoAddress
|
||||
from capa.ida.plugin.model import CapaExplorerDataModel
|
||||
from capa.ida.plugin.qt_compat import QtGui, QtCore, Signal, QAction, QtWidgets
|
||||
|
||||
MAX_SECTION_SIZE = 750
|
||||
|
||||
@@ -147,7 +147,7 @@ def calc_item_depth(o):
|
||||
|
||||
def build_action(o, display, data, slot):
|
||||
""" """
|
||||
action = QAction(display, o)
|
||||
action = QtWidgets.QAction(display, o)
|
||||
|
||||
action.setData(data)
|
||||
action.triggered.connect(lambda checked: slot(action))
|
||||
@@ -312,7 +312,7 @@ class CapaExplorerRulegenPreview(QtWidgets.QTextEdit):
|
||||
|
||||
|
||||
class CapaExplorerRulegenEditor(QtWidgets.QTreeWidget):
|
||||
updated = Signal()
|
||||
updated = QtCore.pyqtSignal()
|
||||
|
||||
def __init__(self, preview, parent=None):
|
||||
""" """
|
||||
|
||||
176
capa/loader.py
176
capa/loader.py
@@ -12,6 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import io
|
||||
import os
|
||||
import logging
|
||||
import datetime
|
||||
@@ -22,13 +23,24 @@ from pathlib import Path
|
||||
from rich.console import Console
|
||||
from typing_extensions import assert_never
|
||||
|
||||
import capa.perf
|
||||
import capa.rules
|
||||
import capa.engine
|
||||
import capa.helpers
|
||||
import capa.version
|
||||
import capa.render.json
|
||||
import capa.rules.cache
|
||||
import capa.render.default
|
||||
import capa.render.verbose
|
||||
import capa.features.common
|
||||
import capa.features.freeze as frz
|
||||
import capa.render.vverbose
|
||||
import capa.features.extractors
|
||||
import capa.render.result_document
|
||||
import capa.render.result_document as rdoc
|
||||
import capa.features.extractors.common
|
||||
import capa.features.extractors.base_extractor
|
||||
import capa.features.extractors.cape.extractor
|
||||
from capa.rules import RuleSet
|
||||
from capa.engine import MatchResults
|
||||
from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError
|
||||
@@ -67,7 +79,7 @@ BACKEND_VMRAY = "vmray"
|
||||
BACKEND_FREEZE = "freeze"
|
||||
BACKEND_BINEXPORT2 = "binexport2"
|
||||
BACKEND_IDA = "ida"
|
||||
BACKEND_GHIDRA = "ghidra"
|
||||
BACKEND_LANCELOT = "lancelot"
|
||||
|
||||
|
||||
class CorruptFile(ValueError):
|
||||
@@ -126,57 +138,6 @@ def get_meta_str(vw):
|
||||
return f"{', '.join(meta)}, number of functions: {len(vw.getFunctions())}"
|
||||
|
||||
|
||||
def _is_probably_corrupt_pe(path: Path) -> bool:
|
||||
"""
|
||||
Heuristic check for obviously malformed PE samples that provoke
|
||||
pathological behavior in vivisect (see GH-1989).
|
||||
|
||||
We treat a PE as "probably corrupt" when any section declares an
|
||||
unrealistically large virtual size compared to the file size, e.g.
|
||||
hundreds of megabytes in a tiny file. Such cases lead vivisect to
|
||||
try to map enormous regions and can exhaust CPU/memory.
|
||||
"""
|
||||
try:
|
||||
import pefile
|
||||
except Exception:
|
||||
# If pefile is unavailable, fall back to existing behavior.
|
||||
return False
|
||||
|
||||
try:
|
||||
pe = pefile.PE(str(path), fast_load=True)
|
||||
except pefile.PEFormatError:
|
||||
# Not a PE file (or badly formed); let existing checks handle it.
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
try:
|
||||
file_size = path.stat().st_size
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
if file_size <= 0:
|
||||
return False
|
||||
|
||||
# Flag sections whose declared virtual size is wildly disproportionate
|
||||
# to the file size (e.g. 900MB section in a ~400KB sample).
|
||||
_VSIZE_FILE_RATIO = 128
|
||||
_MAX_REASONABLE_VSIZE = 512 * 1024 * 1024 # 512 MB
|
||||
max_reasonable = max(file_size * _VSIZE_FILE_RATIO, _MAX_REASONABLE_VSIZE)
|
||||
|
||||
for section in getattr(pe, "sections", []):
|
||||
vsize = getattr(section, "Misc_VirtualSize", 0) or 0
|
||||
if vsize > max_reasonable:
|
||||
logger.debug(
|
||||
"detected unrealistic PE section virtual size: 0x%x (file size: 0x%x), treating as corrupt",
|
||||
vsize,
|
||||
file_size,
|
||||
)
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def get_workspace(path: Path, input_format: str, sigpaths: list[Path]):
|
||||
"""
|
||||
load the program at the given path into a vivisect workspace using the given format.
|
||||
@@ -194,18 +155,11 @@ def get_workspace(path: Path, input_format: str, sigpaths: list[Path]):
|
||||
"""
|
||||
|
||||
# lazy import enables us to not require viv if user wants another backend.
|
||||
import envi.exc
|
||||
import viv_utils
|
||||
import viv_utils.flirt
|
||||
|
||||
logger.debug("generating vivisect workspace for: %s", path)
|
||||
|
||||
if input_format in (FORMAT_PE, FORMAT_AUTO) and _is_probably_corrupt_pe(path):
|
||||
raise CorruptFile(
|
||||
"PE file appears to contain unrealistically large sections and is likely corrupt"
|
||||
+ " - skipping analysis to avoid excessive resource usage."
|
||||
)
|
||||
|
||||
try:
|
||||
if input_format == FORMAT_AUTO:
|
||||
if not is_supported_format(path):
|
||||
@@ -222,20 +176,11 @@ def get_workspace(path: Path, input_format: str, sigpaths: list[Path]):
|
||||
vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="amd64", analyze=False)
|
||||
else:
|
||||
raise ValueError("unexpected format: " + input_format)
|
||||
except envi.exc.SegmentationViolation as e:
|
||||
raise CorruptFile(f"Invalid memory access during binary parsing: {e}") from e
|
||||
except Exception as e:
|
||||
# vivisect raises raw Exception instances, and we don't want
|
||||
# to do a subclass check via isinstance.
|
||||
if type(e) is Exception and e.args:
|
||||
error_msg = str(e.args[0])
|
||||
|
||||
if "Couldn't convert rva" in error_msg:
|
||||
raise CorruptFile(error_msg) from e
|
||||
elif "Unsupported Architecture" in error_msg:
|
||||
# Extract architecture number if available
|
||||
arch_info = e.args[1] if len(e.args) > 1 else "unknown"
|
||||
raise CorruptFile(f"Unsupported architecture: {arch_info}") from e
|
||||
if type(e) is Exception and "Couldn't convert rva" in e.args[0]:
|
||||
raise CorruptFile(e.args[0]) from e
|
||||
raise
|
||||
|
||||
viv_utils.flirt.register_flirt_signature_analyzers(vw, [str(s) for s in sigpaths])
|
||||
@@ -394,24 +339,12 @@ def get_extractor(
|
||||
import capa.features.extractors.ida.extractor
|
||||
|
||||
logger.debug("idalib: opening database...")
|
||||
idapro.enable_console_messages(False)
|
||||
with console.status("analyzing program...", spinner="dots"):
|
||||
# we set the primary and secondary Lumina servers to 0.0.0.0 to disable Lumina,
|
||||
# which sometimes provides bad names, including overwriting names from debug info.
|
||||
#
|
||||
# use -R to load resources, which can help us embedded PE files.
|
||||
#
|
||||
# return values from open_database:
|
||||
# 0 - Success
|
||||
# 2 - User cancelled or 32-64 bit conversion failed
|
||||
# 4 - Database initialization failed
|
||||
# -1 - Generic errors (database already open, auto-analysis failed, etc.)
|
||||
# -2 - User cancelled operation
|
||||
ret = idapro.open_database(
|
||||
str(input_path), run_auto_analysis=True, args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R"
|
||||
)
|
||||
if ret != 0:
|
||||
raise RuntimeError("failed to analyze input file")
|
||||
# idalib writes to stdout (ugh), so we have to capture that
|
||||
# so as not to screw up structured output.
|
||||
with capa.helpers.stdout_redirector(io.BytesIO()):
|
||||
with console.status("analyzing program...", spinner="dots"):
|
||||
if idapro.open_database(str(input_path), run_auto_analysis=True):
|
||||
raise RuntimeError("failed to analyze input file")
|
||||
|
||||
logger.debug("idalib: waiting for analysis...")
|
||||
ida_auto.auto_wait()
|
||||
@@ -419,69 +352,18 @@ def get_extractor(
|
||||
|
||||
return capa.features.extractors.ida.extractor.IdaFeatureExtractor()
|
||||
|
||||
elif backend == BACKEND_GHIDRA:
|
||||
import pyghidra
|
||||
elif backend == BACKEND_LANCELOT:
|
||||
import lancelot
|
||||
|
||||
with console.status("analyzing program...", spinner="dots"):
|
||||
if not pyghidra.started():
|
||||
pyghidra.start()
|
||||
import capa.features.extractors.binexport2
|
||||
import capa.features.extractors.binexport2.extractor
|
||||
|
||||
import capa.ghidra.helpers
|
||||
buf = input_path.read_bytes()
|
||||
be2_buf: bytes = lancelot.binexport2_from_bytes(buf)
|
||||
be2 = capa.features.extractors.binexport2.get_binexport2_from_bytes(be2_buf)
|
||||
|
||||
if not capa.ghidra.helpers.is_supported_ghidra_version():
|
||||
raise RuntimeError("unsupported Ghidra version")
|
||||
return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf)
|
||||
|
||||
import tempfile
|
||||
|
||||
tmpdir = tempfile.TemporaryDirectory()
|
||||
|
||||
project_cm = pyghidra.open_project(tmpdir.name, "CapaProject", create=True)
|
||||
project = project_cm.__enter__()
|
||||
try:
|
||||
from ghidra.util.task import TaskMonitor
|
||||
|
||||
monitor = TaskMonitor.DUMMY
|
||||
|
||||
# Import file
|
||||
loader = pyghidra.program_loader().project(project).source(str(input_path)).name(input_path.name)
|
||||
with loader.load() as load_results:
|
||||
load_results.save(monitor)
|
||||
|
||||
# Open program
|
||||
program, consumer = pyghidra.consume_program(project, "/" + input_path.name)
|
||||
|
||||
# Analyze
|
||||
pyghidra.analyze(program, monitor)
|
||||
|
||||
from ghidra.program.flatapi import FlatProgramAPI
|
||||
|
||||
flat_api = FlatProgramAPI(program)
|
||||
|
||||
import capa.features.extractors.ghidra.context as ghidra_context
|
||||
|
||||
ghidra_context.set_context(program, flat_api, monitor)
|
||||
|
||||
# Wrapper to handle cleanup of program (consumer) and project
|
||||
class GhidraContextWrapper:
|
||||
def __init__(self, project_cm, program, consumer):
|
||||
self.project_cm = project_cm
|
||||
self.program = program
|
||||
self.consumer = consumer
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.program.release(self.consumer)
|
||||
self.project_cm.__exit__(exc_type, exc_val, exc_tb)
|
||||
|
||||
cm = GhidraContextWrapper(project_cm, program, consumer)
|
||||
|
||||
except Exception:
|
||||
project_cm.__exit__(None, None, None)
|
||||
tmpdir.cleanup()
|
||||
raise
|
||||
|
||||
import capa.features.extractors.ghidra.extractor
|
||||
|
||||
return capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor(ctx_manager=cm, tmpdir=tmpdir)
|
||||
else:
|
||||
raise ValueError("unexpected backend: " + backend)
|
||||
|
||||
|
||||
27
capa/main.py
27
capa/main.py
@@ -55,9 +55,9 @@ from capa.loader import (
|
||||
BACKEND_VMRAY,
|
||||
BACKEND_DOTNET,
|
||||
BACKEND_FREEZE,
|
||||
BACKEND_GHIDRA,
|
||||
BACKEND_PEFILE,
|
||||
BACKEND_DRAKVUF,
|
||||
BACKEND_LANCELOT,
|
||||
BACKEND_BINEXPORT2,
|
||||
)
|
||||
from capa.helpers import (
|
||||
@@ -299,7 +299,7 @@ def install_common_args(parser, wanted=None):
|
||||
(BACKEND_BINJA, "Binary Ninja"),
|
||||
(BACKEND_DOTNET, ".NET"),
|
||||
(BACKEND_BINEXPORT2, "BinExport2"),
|
||||
(BACKEND_GHIDRA, "Ghidra"),
|
||||
(BACKEND_LANCELOT, "Lancelot"),
|
||||
(BACKEND_FREEZE, "capa freeze"),
|
||||
(BACKEND_CAPE, "CAPE"),
|
||||
(BACKEND_DRAKVUF, "DRAKVUF"),
|
||||
@@ -394,7 +394,6 @@ class ShouldExitError(Exception):
|
||||
"""raised when a main-related routine indicates the program should exit."""
|
||||
|
||||
def __init__(self, status_code: int):
|
||||
super().__init__(status_code)
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
@@ -661,9 +660,7 @@ def get_rules_from_cli(args) -> RuleSet:
|
||||
raises:
|
||||
ShouldExitError: if the program is invoked incorrectly and should exit.
|
||||
"""
|
||||
enable_cache: bool = getattr(args, "enable_cache", True)
|
||||
# this allows calling functions to easily disable rule caching, e.g., used by the rule linter to avoid
|
||||
|
||||
enable_cache: bool = True
|
||||
try:
|
||||
if capa.helpers.is_running_standalone() and args.is_default_rules:
|
||||
cache_dir = get_default_root() / "cache"
|
||||
@@ -945,7 +942,8 @@ def main(argv: Optional[list[str]] = None):
|
||||
argv = sys.argv[1:]
|
||||
|
||||
desc = "The FLARE team's open-source tool to identify capabilities in executable files."
|
||||
epilog = textwrap.dedent("""
|
||||
epilog = textwrap.dedent(
|
||||
"""
|
||||
By default, capa uses a default set of embedded rules.
|
||||
You can see the rule set here:
|
||||
https://github.com/mandiant/capa-rules
|
||||
@@ -972,7 +970,8 @@ def main(argv: Optional[list[str]] = None):
|
||||
|
||||
filter rules by meta fields, e.g. rule name or namespace
|
||||
capa -t "create TCP socket" suspicious.exe
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
@@ -1107,26 +1106,14 @@ def ida_main():
|
||||
|
||||
|
||||
def ghidra_main():
|
||||
from ghidra.program.flatapi import FlatProgramAPI
|
||||
|
||||
import capa.rules
|
||||
import capa.ghidra.helpers
|
||||
import capa.render.default
|
||||
import capa.features.extractors.ghidra.context
|
||||
import capa.features.extractors.ghidra.extractor
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
# These are provided by the Ghidra scripting environment
|
||||
# but are not available when running standard python
|
||||
# so we have to ignore the linting errors
|
||||
program = currentProgram # type: ignore [name-defined] # noqa: F821
|
||||
monitor_ = monitor # type: ignore [name-defined] # noqa: F821
|
||||
flat_api = FlatProgramAPI(program)
|
||||
|
||||
capa.features.extractors.ghidra.context.set_context(program, flat_api, monitor_)
|
||||
|
||||
logger.debug("-" * 80)
|
||||
logger.debug(" Using default embedded rules.")
|
||||
logger.debug(" ")
|
||||
|
||||
@@ -31,7 +31,6 @@ $ protoc.exe --python_out=. --mypy_out=. <path_to_proto> (e.g. capa/render/proto
|
||||
|
||||
Alternatively, --pyi_out=. can be used to generate a Python Interface file that supports development
|
||||
"""
|
||||
|
||||
import datetime
|
||||
from typing import Any, Union
|
||||
|
||||
|
||||
@@ -418,9 +418,8 @@ class Match(FrozenModel):
|
||||
and a.id <= location.id
|
||||
]
|
||||
)
|
||||
if matches_in_thread:
|
||||
_, most_recent_match = matches_in_thread[-1]
|
||||
children.append(Match.from_capa(rules, capabilities, most_recent_match))
|
||||
_, most_recent_match = matches_in_thread[-1]
|
||||
children.append(Match.from_capa(rules, capabilities, most_recent_match))
|
||||
|
||||
else:
|
||||
children.append(Match.from_capa(rules, capabilities, rule_matches[location]))
|
||||
@@ -479,11 +478,8 @@ class Match(FrozenModel):
|
||||
and a.id <= location.id
|
||||
]
|
||||
)
|
||||
# namespace matches may not occur within the same thread as the result, so only
|
||||
# proceed if a match within the same thread is found
|
||||
if matches_in_thread:
|
||||
_, most_recent_match = matches_in_thread[-1]
|
||||
children.append(Match.from_capa(rules, capabilities, most_recent_match))
|
||||
_, most_recent_match = matches_in_thread[-1]
|
||||
children.append(Match.from_capa(rules, capabilities, most_recent_match))
|
||||
else:
|
||||
if location in rule_matches:
|
||||
children.append(Match.from_capa(rules, capabilities, rule_matches[location]))
|
||||
|
||||
@@ -17,7 +17,6 @@ import io
|
||||
from typing import Union, Iterator, Optional
|
||||
|
||||
import rich.console
|
||||
from rich.markup import escape
|
||||
from rich.progress import Text
|
||||
|
||||
import capa.render.result_document as rd
|
||||
@@ -25,21 +24,21 @@ import capa.render.result_document as rd
|
||||
|
||||
def bold(s: str) -> Text:
|
||||
"""draw attention to the given string"""
|
||||
return Text.from_markup(f"[cyan]{escape(s)}")
|
||||
return Text.from_markup(f"[cyan]{s}")
|
||||
|
||||
|
||||
def bold2(s: str) -> Text:
|
||||
"""draw attention to the given string, within a `bold` section"""
|
||||
return Text.from_markup(f"[green]{escape(s)}")
|
||||
return Text.from_markup(f"[green]{s}")
|
||||
|
||||
|
||||
def mute(s: str) -> Text:
|
||||
"""draw attention away from the given string"""
|
||||
return Text.from_markup(f"[dim]{escape(s)}")
|
||||
return Text.from_markup(f"[dim]{s}")
|
||||
|
||||
|
||||
def warn(s: str) -> Text:
|
||||
return Text.from_markup(f"[yellow]{escape(s)}")
|
||||
return Text.from_markup(f"[yellow]{s}")
|
||||
|
||||
|
||||
def format_parts_id(data: Union[rd.AttackSpec, rd.MBCSpec]):
|
||||
|
||||
@@ -159,8 +159,9 @@ def render_call(layout: rd.DynamicLayout, addr: frz.Address) -> str:
|
||||
s.append(f"){rest}")
|
||||
|
||||
newline = "\n"
|
||||
# Use default (non-dim) styling for API details so they remain readable in -vv output
|
||||
return f"{pname}{{pid:{call.thread.process.pid},tid:{call.thread.tid},call:{call.id}}}\n{newline.join(s)}"
|
||||
return (
|
||||
f"{pname}{{pid:{call.thread.process.pid},tid:{call.thread.tid},call:{call.id}}}\n{rutils.mute(newline.join(s))}"
|
||||
)
|
||||
|
||||
|
||||
def render_short_call(layout: rd.DynamicLayout, addr: frz.Address) -> str:
|
||||
@@ -179,8 +180,7 @@ def render_short_call(layout: rd.DynamicLayout, addr: frz.Address) -> str:
|
||||
s.append(f"){rest}")
|
||||
|
||||
newline = "\n"
|
||||
# Use default (non-dim) styling for API details so they remain readable in -vv output
|
||||
return f"call:{call.id}\n{newline.join(s)}"
|
||||
return f"call:{call.id}\n{rutils.mute(newline.join(s))}"
|
||||
|
||||
|
||||
def render_static_meta(console: Console, meta: rd.StaticMetadata):
|
||||
|
||||
@@ -274,8 +274,12 @@ SUPPORTED_FEATURES[Scope.FUNCTION].update(SUPPORTED_FEATURES[Scope.BASIC_BLOCK])
|
||||
|
||||
|
||||
class InvalidRule(ValueError):
|
||||
def __init__(self, msg):
|
||||
super().__init__()
|
||||
self.msg = msg
|
||||
|
||||
def __str__(self):
|
||||
return f"invalid rule: {super().__str__()}"
|
||||
return f"invalid rule: {self.msg}"
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
@@ -285,15 +289,20 @@ class InvalidRuleWithPath(InvalidRule):
|
||||
def __init__(self, path, msg):
|
||||
super().__init__(msg)
|
||||
self.path = path
|
||||
self.msg = msg
|
||||
self.__cause__ = None
|
||||
|
||||
def __str__(self):
|
||||
return f"invalid rule: {self.path}: {super(InvalidRule, self).__str__()}"
|
||||
return f"invalid rule: {self.path}: {self.msg}"
|
||||
|
||||
|
||||
class InvalidRuleSet(ValueError):
|
||||
def __init__(self, msg):
|
||||
super().__init__()
|
||||
self.msg = msg
|
||||
|
||||
def __str__(self):
|
||||
return f"invalid rule set: {super().__str__()}"
|
||||
return f"invalid rule set: {self.msg}"
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
@@ -1093,15 +1102,15 @@ class Rule:
|
||||
@lru_cache()
|
||||
def _get_yaml_loader():
|
||||
try:
|
||||
# prefer to use CLoader to be fast, see #306 / CSafeLoader is the same as CLoader but with safe loading
|
||||
# prefer to use CLoader to be fast, see #306
|
||||
# on Linux, make sure you install libyaml-dev or similar
|
||||
# on Windows, get WHLs from pyyaml.org/pypi
|
||||
logger.debug("using libyaml CSafeLoader.")
|
||||
return yaml.CSafeLoader
|
||||
logger.debug("using libyaml CLoader.")
|
||||
return yaml.CLoader
|
||||
except Exception:
|
||||
logger.debug("unable to import libyaml CSafeLoader, falling back to Python yaml parser.")
|
||||
logger.debug("unable to import libyaml CLoader, falling back to Python yaml parser.")
|
||||
logger.debug("this will be slower to load rules.")
|
||||
return yaml.SafeLoader
|
||||
return yaml.Loader
|
||||
|
||||
@staticmethod
|
||||
def _get_ruamel_yaml_parser():
|
||||
@@ -1143,8 +1152,6 @@ class Rule:
|
||||
else:
|
||||
# use pyyaml because it can be much faster than ruamel (pure python)
|
||||
doc = yaml.load(s, Loader=cls._get_yaml_loader())
|
||||
if doc is None or not isinstance(doc, dict) or "rule" not in doc:
|
||||
raise InvalidRule("empty or invalid YAML document")
|
||||
return cls.from_dict(doc, s)
|
||||
|
||||
@classmethod
|
||||
@@ -1449,13 +1456,6 @@ class RuleSet:
|
||||
scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes
|
||||
}
|
||||
|
||||
# Pre-compute the topological index mapping for each scope.
|
||||
# This avoids rebuilding the dict on every call to _match (which runs once per
|
||||
# instruction/basic-block/function/file scope, i.e. potentially millions of times).
|
||||
self._rule_index_by_scope: dict[Scope, dict[str, int]] = {
|
||||
scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes
|
||||
}
|
||||
|
||||
@property
|
||||
def file_rules(self):
|
||||
return self.rules_by_scope[Scope.FILE]
|
||||
@@ -1885,13 +1885,11 @@ class RuleSet:
|
||||
"""
|
||||
done = []
|
||||
|
||||
# use a list as a stack: append new items and pop() from the end, both O(1).
|
||||
# order doesn't matter here since every rule in the queue is processed eventually.
|
||||
rules_stack = list(rules)
|
||||
while rules_stack:
|
||||
rule = rules_stack.pop()
|
||||
# use a queue of rules, because we'll be modifying the list (appending new items) as we go.
|
||||
while rules:
|
||||
rule = rules.pop(0)
|
||||
for subscope_rule in rule.extract_subscope_rules():
|
||||
rules_stack.append(subscope_rule)
|
||||
rules.append(subscope_rule)
|
||||
done.append(rule)
|
||||
|
||||
return done
|
||||
@@ -1940,11 +1938,11 @@ class RuleSet:
|
||||
"""
|
||||
|
||||
feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope]
|
||||
rules: list[Rule] = self.rules_by_scope[scope]
|
||||
# Topologic location of rule given its name.
|
||||
# That is, rules with a lower index should be evaluated first, since their dependencies
|
||||
# will be evaluated later.
|
||||
# Pre-computed in __init__ to avoid rebuilding on every _match call.
|
||||
rule_index_by_rule_name = self._rule_index_by_scope[scope]
|
||||
rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)}
|
||||
|
||||
# This algorithm is optimized to evaluate as few rules as possible,
|
||||
# because the less work we do, the faster capa can run.
|
||||
@@ -2040,9 +2038,7 @@ class RuleSet:
|
||||
candidate_rules = [self.rules[name] for name in candidate_rule_names]
|
||||
|
||||
# Order rules topologically, so that rules with dependencies work correctly.
|
||||
# Sort descending so pop() from the end yields the topologically-first rule in O(1).
|
||||
RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
|
||||
candidate_rules.reverse()
|
||||
|
||||
#
|
||||
# The following is derived from ceng.match
|
||||
@@ -2057,7 +2053,7 @@ class RuleSet:
|
||||
augmented_features = features
|
||||
|
||||
while candidate_rules:
|
||||
rule = candidate_rules.pop()
|
||||
rule = candidate_rules.pop(0)
|
||||
res = rule.evaluate(augmented_features, short_circuit=True)
|
||||
if res:
|
||||
# we first matched the rule with short circuiting enabled.
|
||||
@@ -2096,7 +2092,6 @@ class RuleSet:
|
||||
candidate_rule_names.update(new_candidates)
|
||||
candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
|
||||
RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
|
||||
candidate_rules.reverse()
|
||||
|
||||
return (augmented_features, results)
|
||||
|
||||
@@ -2233,10 +2228,7 @@ def get_rules(
|
||||
|
||||
try:
|
||||
rule = Rule.from_yaml(content.decode("utf-8"))
|
||||
except InvalidRule as e:
|
||||
if e.args and e.args[0] == "empty or invalid YAML document":
|
||||
logger.warning("skipping %s: %s", path, e)
|
||||
continue
|
||||
except InvalidRule:
|
||||
raise
|
||||
else:
|
||||
rule.meta["capa/path"] = path.as_posix()
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__version__ = "9.3.1"
|
||||
__version__ = "9.0.0"
|
||||
|
||||
|
||||
def get_major_version():
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
# mapa html-map demo
|
||||
|
||||
*2026-03-16T17:05:38Z by Showboat 0.6.1*
|
||||
<!-- showboat-id: 1cf46a16-c3d7-459d-8593-c23080cb12f6 -->
|
||||
|
||||
Generate a standalone HTML report for a sample binary and summarize the report contents.
|
||||
|
||||
```bash
|
||||
tmp=$(mktemp /tmp/mapa-html-map-XXXXXX.html)
|
||||
PYTHONWARNINGS=ignore ./.venv/bin/python -m mapa binaries/01/16/mpbindump.exe --output html-map --quiet > "$tmp"
|
||||
PYTHONWARNINGS=ignore /usr/bin/python3 - "$tmp" <<"PY"
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
text = Path(sys.argv[1]).read_text()
|
||||
match = re.search(r"<script type=\"application/json\" id=\"mapa-data\">(.*?)</script>", text, re.S)
|
||||
data = json.loads(match.group(1))
|
||||
print("doctype", text.splitlines()[0])
|
||||
print("functions", len(data["functions"]))
|
||||
print("tags", len(data["tags"]))
|
||||
print("strings", len(data["strings"]))
|
||||
PY
|
||||
rm "$tmp"
|
||||
```
|
||||
|
||||
```output
|
||||
doctype <!doctype html>
|
||||
functions 1406
|
||||
tags 12
|
||||
strings 81
|
||||
```
|
||||
|
||||
To open the report directly in your browser, use `python -m mapa <sample> --output html-map --open`.
|
||||
BIN
doc/img/ghidra_headless_analyzer.png
Normal file
BIN
doc/img/ghidra_headless_analyzer.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 210 KiB |
BIN
doc/img/ghidra_script_mngr_output.png
Executable file
BIN
doc/img/ghidra_script_mngr_output.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 108 KiB |
BIN
doc/img/ghidra_script_mngr_rules.png
Executable file
BIN
doc/img/ghidra_script_mngr_rules.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 110 KiB |
BIN
doc/img/ghidra_script_mngr_verbosity.png
Executable file
BIN
doc/img/ghidra_script_mngr_verbosity.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 79 KiB |
@@ -1,100 +0,0 @@
|
||||
# mapa design
|
||||
|
||||
## Architecture
|
||||
|
||||
Seven layers, each in its own module under the `mapa/` package:
|
||||
|
||||
| Module | Responsibility |
|
||||
|---|---|
|
||||
| `mapa/model.py` | Backend-neutral dataclasses: `MapaReport`, `MapaMeta`, `MapaSection`, `MapaLibrary`, `MapaFunction`, `MapaCall`, `MapaCaller`, `MapaString`, `MapaProgramString`, `AssemblageRecord` |
|
||||
| `mapa/assemblage.py` | Assemblage CSV loading, column validation, SHA-256 filtering, RVA-to-VA conversion, exact-row deduplication |
|
||||
| `mapa/ida_db.py` | IDA database lifecycle: `resolve_database()`, `open_database_session()`, SHA-256 caching, flock-based concurrency guard |
|
||||
| `mapa/collector.py` | Populates `MapaReport` from an open `ida_domain.Database`. All IDA queries live here. |
|
||||
| `mapa/renderer.py` | Rich-based text rendering and plain single-function summary formatting from `MapaReport`. No IDA dependency. |
|
||||
| `mapa/html_renderer.py` | Self-contained `html-map` rendering from `MapaReport`. No IDA dependency. |
|
||||
| `mapa/cli.py` | Argument parsing, capa/assemblage loading, output-mode selection, `--open` temp-file/browser handling, orchestration |
|
||||
|
||||
`scripts/mapa.py` is a thin entry point that delegates to `mapa.cli.main()`.
|
||||
|
||||
The CLI validates output-mode combinations before analysis. For `--output html-map --open`, it renders the HTML once, writes it to a temporary `.html` file via `NamedTemporaryFile(delete=False)`, and opens the browser with `webbrowser.open(file://...)`.
|
||||
|
||||
## Database lifecycle
|
||||
|
||||
Modeled on `idals.py` from idawilli:
|
||||
|
||||
1. If input is `.i64`/`.idb`, use directly.
|
||||
2. Otherwise, hash the file (MD5 + SHA-256), check `~/.cache/mandiant/mapa/<sha256>.i64`.
|
||||
3. On cache miss: acquire advisory flock, create database via `Database.open()` with `IdaCommandOptions(auto_analysis=True, new_database=True, output_database=..., load_resources=True)`, wait for `ida_auto.auto_wait()`.
|
||||
4. On cache hit or after creation: open read-only with `new_database=False, save_on_close=False`.
|
||||
5. Concurrency guard: poll for `.nam` file disappearance + `fcntl.flock` on `<db>.lock` + TOCTOU re-check.
|
||||
|
||||
## Assemblage loading
|
||||
|
||||
Assemblage loading is deferred until after mapa opens the IDA database, because the effective sample SHA-256 may come from either the raw input file or the database metadata.
|
||||
|
||||
`mapa.assemblage.load_assemblage_records()`:
|
||||
|
||||
- reads the CSV with `csv.DictReader`
|
||||
- requires `hash`, `name`, `start`, `end`, and `source_file`
|
||||
- filters rows by sample SHA-256, case-insensitively
|
||||
- converts `start` and `end` RVAs to VAs by adding `db.base_address`
|
||||
- strips the trailing provenance suffix from `source_file` only at render time, via `AssemblageRecord.source_path`
|
||||
- deduplicates exact duplicate rows while preserving CSV order for distinct ambiguous matches
|
||||
|
||||
The result is `dict[int, list[AssemblageRecord]]`, keyed by function start VA.
|
||||
|
||||
## Collector design
|
||||
|
||||
The collector builds several indexes before the main function loop:
|
||||
|
||||
- import_index: `dict[int, (module, name)]` from `db.imports.get_all_imports()`
|
||||
- extern_addrs: `set[int]` from functions in XTRN segments
|
||||
- thunk_targets: `dict[int, int]` via `_resolve_thunk_target()` — follows code refs then data refs, max depth 5, single-target chains only
|
||||
- resolved_callers/callees: built by walking all non-thunk function flowcharts, resolving call targets through thunk chains, classifying as internal vs API
|
||||
|
||||
String extraction follows single data-reference chains from each instruction up to depth 10. The collector returns both the discovered string VA and the raw string value for each hit.
|
||||
|
||||
The collector stores string data in two shapes:
|
||||
|
||||
- `MapaFunction.strings` for the text report and tooltip summaries. These stay function-local and deduplicate by trimmed display value.
|
||||
- `MapaReport.program_strings` for `html-map`. These are keyed by string VA, preserve duplicate display values at different addresses, merge tags across repeated references, and track the set of referencing function addresses.
|
||||
|
||||
Assemblage data is attached per function during collection. `MapaFunction.assemblage_records` carries zero or more `AssemblageRecord` values for the function start address. The collector does not use Assemblage to rename functions, callers, or callees.
|
||||
|
||||
## ida-domain API usage
|
||||
|
||||
Primary queries used:
|
||||
|
||||
- `db.functions` — iteration, `get_at()`, `get_name()`, `get_flags()`, `get_flowchart()`
|
||||
- `db.segments.get_all()` — section enumeration
|
||||
- `db.imports.get_all_modules()`, `get_all_imports()` — library/import enumeration
|
||||
- `db.xrefs.code_refs_from_ea()`, `data_refs_from_ea()`, `calls_from_ea()` — call/thunk resolution
|
||||
- `db.instructions.is_call_instruction()`, `get_mnemonic()` — instruction classification
|
||||
- `db.heads.size()` — instruction byte size
|
||||
- `FlowChart` with `FlowChartFlags.NOEXT | FlowChartFlags.PREDS` — CFG traversal
|
||||
- `FunctionFlags.THUNK`, `FunctionFlags.LIB` — function classification
|
||||
|
||||
No legacy `ida_*` module calls are used. All queries go through `ida-domain`.
|
||||
|
||||
## Rendering
|
||||
|
||||
`mapa/renderer.py` prints the text report in function address order. For each function, it prints the IDA-derived header first and then any Assemblage annotations as `assemblage name:` and `assemblage file:` lines. When multiple distinct Assemblage rows map to one function start address, the renderer prints all of them in order.
|
||||
|
||||
The text renderer also exposes a plain single-function summary formatter used by `html-map` tooltips. The row order matches text mode: Assemblage lines, xrefs, CFG stats, capa matches, internal calls, APIs, and strings.
|
||||
|
||||
For source-file separators, the text renderer uses the first Assemblage record's normalized source path as the function's primary source path. It tracks the last seen non-empty primary path across the function list. Missing Assemblage data does not trigger a separator and does not reset that state. When a later function introduces a different primary path, the renderer prints a muted horizontal rule with `[ <path> ]` immediately before that function.
|
||||
|
||||
`mapa/html_renderer.py` renders a single self-contained HTML document. It emits a split view: a left function pane and a right string pane, both with independent scrolling. The panes are separated by a draggable vertical divider implemented with a small inline pointer-event handler. The renderer emits one square per function in address order, one program-string row per string VA in address order, tag controls with visible function counts, a small legend for border/fill/dim semantics, right-aligned visible tags in each string row, inline JSON data for tag/string memberships, a single floating tooltip, and a small inline script that handles hover and click locking.
|
||||
|
||||
## String tagging
|
||||
|
||||
Vendored Quantum Strand string databases live under `mapa/string_tags/data/` in five families: OSS/CRT libraries (gzipped JSONL), expert rules (plain JSONL), Windows API names (gzipped text), global prevalence (gzipped JSONL + binary hash files), and junk-code strings (gzipped JSONL).
|
||||
|
||||
The `mapa/string_tags/` package has three modules:
|
||||
- `model.py` — `StringTagMatch` and `StringTagResult` dataclasses
|
||||
- `loaders.py` — file-format readers using `msgspec`, `gzip`, `hashlib`, and `importlib.resources`
|
||||
- `tagger.py` — `StringTagger` class with `tag_string(raw) -> StringTagResult`, plus `load_default_tagger()` which lazily loads and caches all databases process-wide
|
||||
|
||||
The collector tags raw strings before `rstrip()` trimming. When two raw strings collapse to the same display value, their tags and match metadata are merged. `MapaString` carries `tags: tuple[str, ...]` and `tag_matches: tuple[StringTagMatch, ...]`.
|
||||
|
||||
The text renderer uses a Rich `Text`-based helper to right-align the visible tag column on `string:` rows. The HTML renderer reuses the same visible-tag policy, builds its top tag controls from those visible tags only, shows the distinct-function count for each visible tag, and renders the visible tags right-aligned in each program-string row. The visible tag policy suppresses `#common` when a more-specific tag is also present.
|
||||
@@ -1,502 +0,0 @@
|
||||
# mapa IDA/IDALib port plan
|
||||
|
||||
Goal: preserve the current `scripts/mapa.py` report while replacing the Lancelot/BinExport2 backend with IDALib. Use the `ida-domain` API for normal analysis where it cleanly maps to the needed data. Reuse the existing capa IDA backend as the semantic reference for imports, thunks, string resolution, function naming, and database bootstrap.
|
||||
|
||||
This revision adds concrete guidance from capa's existing IDA backend.
|
||||
|
||||
## Sources used
|
||||
|
||||
Primary sources:
|
||||
- `scripts/mapa.py`
|
||||
- https://github.com/HexRaysSA/ida-claude-plugins/blob/main/plugins/ida-plugin-development/skills/ida-domain-api/SKILL.md
|
||||
- `capa/features/extractors/ida/idalib.py`
|
||||
- `capa/features/extractors/ida/extractor.py`
|
||||
- `capa/features/extractors/ida/helpers.py`
|
||||
- `capa/features/extractors/ida/file.py`
|
||||
- `capa/features/extractors/ida/function.py`
|
||||
- `capa/features/extractors/ida/insn.py`
|
||||
- `capa/features/extractors/ida/basicblock.py`
|
||||
- `capa/ida/helpers.py`
|
||||
- `capa/loader.py`
|
||||
- `tests/fixtures.py`
|
||||
- `tests/test_idalib_features.py`
|
||||
- `capa/features/common.py`
|
||||
- `idals.py` from https://github.com/williballenthin/idawilli/tree/master/idals
|
||||
|
||||
Domain API references:
|
||||
- Overview: https://ida-domain.docs.hex-rays.com/llms.txt
|
||||
- Getting started: https://ida-domain.docs.hex-rays.com/getting_started/index.md
|
||||
- Examples: https://ida-domain.docs.hex-rays.com/examples/index.md
|
||||
- Database: https://ida-domain.docs.hex-rays.com/ref/database/index.md
|
||||
- Functions: https://ida-domain.docs.hex-rays.com/ref/functions/index.md
|
||||
- Flowchart: https://ida-domain.docs.hex-rays.com/ref/flowchart/index.md
|
||||
- Instructions: https://ida-domain.docs.hex-rays.com/ref/instructions/index.md
|
||||
- Xrefs: https://ida-domain.docs.hex-rays.com/ref/xrefs/index.md
|
||||
- Strings: https://ida-domain.docs.hex-rays.com/ref/strings/index.md
|
||||
- Segments: https://ida-domain.docs.hex-rays.com/ref/segments/index.md
|
||||
- Names: https://ida-domain.docs.hex-rays.com/ref/names/index.md
|
||||
- Entries: https://ida-domain.docs.hex-rays.com/ref/entries/index.md
|
||||
|
||||
## Key correction after reviewing capa
|
||||
|
||||
The current `capa/features/extractors/ida/` backend is IDALib-capable, but it is not written against `ida-domain` today. It uses the classic IDA Python surface: `idapro`, `idaapi`, `idautils`, `idc`, `ida_bytes`, `ida_funcs`, `ida_segment`, and related modules.
|
||||
|
||||
That means the correct migration strategy is not "invent a fresh IDA collector from scratch". The correct strategy is:
|
||||
- use capa's existing IDA backend as the behavioral spec and a source of proven heuristics
|
||||
- implement the new collector against `ida-domain` wherever the needed API exists cleanly
|
||||
- treat the existing legacy helpers as reference material, not as the default implementation path
|
||||
- only introduce lower-level `ida_*` calls if the implementer can point to a concrete `ida-domain` gap and document it
|
||||
|
||||
This is especially important for:
|
||||
- IDALib database bootstrap
|
||||
- import and extern enumeration
|
||||
- thunk-chain resolution
|
||||
- string/data-reference chasing
|
||||
- alternative function names from comments
|
||||
- known IDA version caveats
|
||||
|
||||
## Current mapa output that must remain stable
|
||||
|
||||
The current script renders these sections, in this order:
|
||||
- `meta`
|
||||
- `modules`
|
||||
- `sections`
|
||||
- `libraries`
|
||||
- `functions`
|
||||
|
||||
Accepted intentional change for the port: remove `modules` entirely.
|
||||
|
||||
Inside `functions`, it currently:
|
||||
- iterates functions in address order
|
||||
- prints `thunk ...` for thunk functions
|
||||
- prints `function ...` for normal functions
|
||||
- inserts source-file separators when the primary Assemblage source path changes, ignoring gaps with missing Assemblage data
|
||||
- annotates functions with Assemblage source name and file when available
|
||||
- forwards callers through thunk targets so callers of a thunk appear on the real target
|
||||
- prints `B/E/I` as basic blocks / CFG edges / instructions plus total instruction bytes
|
||||
- prints capa rule names attached to the function
|
||||
- prints `calls:` for internal non-library callees
|
||||
- prints `api:` for import/external/library callees
|
||||
- prints `string:` for referenced strings
|
||||
|
||||
That output contract should stay stable unless a deliberate change is accepted and documented.
|
||||
|
||||
## What capa already gives you
|
||||
|
||||
There are three reusable assets.
|
||||
|
||||
The first is a proven `ida-domain` database-resolution and session-opening path in `../idawilli/idals/idals.py`. That code already does the part mapa needs most: accept either a raw sample or an existing `.i64` / `.idb`, hash raw inputs, cache analyzed databases by SHA-256, and guard concurrent access.
|
||||
|
||||
The second is capa's proven IDALib bootstrap path:
|
||||
- `capa/features/extractors/ida/idalib.py`
|
||||
- `capa/loader.py`
|
||||
- `tests/fixtures.py`
|
||||
|
||||
The third is capa's proven semantic definition of the data mapa cares about:
|
||||
- function enumeration: `capa/features/extractors/ida/extractor.py`
|
||||
- segments, imports, externs: `capa/features/extractors/ida/file.py`, `helpers.py`
|
||||
- callers and function names: `capa/features/extractors/ida/function.py`
|
||||
- API calls, call targets, strings, mnemonics, offsets: `capa/features/extractors/ida/insn.py`
|
||||
- CFG/basic blocks: `capa/features/extractors/ida/basicblock.py`, `helpers.py`
|
||||
- hashes, architecture, imagebase, file-type helpers: `capa/ida/helpers.py`
|
||||
|
||||
The practical split is simple. Use `idals.py` as the model for database resolution, caching, and guarded open/close. Use capa's IDA backend as the model for analysis semantics and parity behavior. Use `ida-domain` as the primary query surface inside the collector. Do not depend on Lancelot anywhere in the new implementation.
|
||||
|
||||
## Important behavioral facts from capa's backend
|
||||
|
||||
1. IDALib bootstrap in capa is not a bare `Database.open(...)` call.
|
||||
- It uses `capa.features.extractors.ida.idalib.has_idalib()` and `load_idalib()`.
|
||||
- It then calls `idapro.open_database(..., run_auto_analysis=True, args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R")`.
|
||||
- It disables console chatter with `idapro.enable_console_messages(False)`.
|
||||
- It waits for analysis completion with `ida_auto.auto_wait()`.
|
||||
|
||||
2. Capa explicitly disables Lumina during IDALib analysis.
|
||||
- Reason documented in `capa/loader.py`: Lumina can inject bad names or overwrite debug-info names.
|
||||
- mapa should do the same unless there is a deliberate decision to trust Lumina.
|
||||
|
||||
3. Capa requests resource loading with `-R`.
|
||||
- This matters for some file-scope extraction.
|
||||
- `tests/test_idalib_features.py` notes that IDA 9.0 had resource-loading limitations under IDALib.
|
||||
|
||||
4. The existing `IdaFeatureExtractor.get_functions()` is not a direct drop-in for mapa.
|
||||
- It calls `helpers.get_functions(skip_thunks=True, skip_libs=True)`.
|
||||
- mapa must render thunk functions, so mapa needs its own full function inventory.
|
||||
|
||||
5. Capa already encodes the thunk semantics mapa needs.
|
||||
- `THUNK_CHAIN_DEPTH_DELTA` is defined in `capa/features/common.py` as `5`.
|
||||
- `capa/features/extractors/ida/insn.py:check_for_api_call()` follows code refs, then data refs, through thunk chains to resolve imports/externs.
|
||||
- `capa/features/extractors/binexport2/__init__.py:BinExport2Analysis._compute_thunks()` shows the intended "single-target thunk chain" rule: only resolve through chains with exactly one callee per thunk hop.
|
||||
|
||||
6. Capa already encodes mapa-relevant string semantics.
|
||||
- `helpers.find_data_reference_from_insn(insn, max_depth=10)` follows single data-reference chains.
|
||||
- `helpers.find_string_at(ea)` looks for C strings and works around an IDA Unicode-decoding quirk.
|
||||
- `insn.extract_insn_string_features()` and `extract_insn_bytes_features()` use that behavior.
|
||||
|
||||
7. Capa already has the import and extern logic mapa needs.
|
||||
- `helpers.get_file_imports()` enumerates import modules and normalizes names.
|
||||
- `helpers.get_file_externs()` enumerates functions from `SEG_XTRN` segments.
|
||||
- `file.extract_file_import_names()` shows how capa treats name-vs-ordinal imports.
|
||||
|
||||
8. Capa already has alternative-name logic.
|
||||
- `helpers.get_function_alternative_names()` parses comments that look like `Alternative name is 'foo'`.
|
||||
- `function.extract_function_alternative_names()` exposes them as `FunctionName` features.
|
||||
|
||||
9. Capa already has the CFG behavior mapa should match.
|
||||
- `helpers.get_function_blocks()` uses `idaapi.FlowChart(f, flags=(idaapi.FC_PREDS | idaapi.FC_NOEXT))`.
|
||||
- The `NOEXT` part matters: it avoids useless external blocks contaminating B/E/I counts.
|
||||
|
||||
10. The test suite documents real version caveats.
|
||||
- IDA 9.0 and 9.1 had some ELF symbol issues.
|
||||
- IDA 9.0 under IDALib had resource-loading limitations.
|
||||
- mapa validation should account for those when comparing outputs.
|
||||
|
||||
## Database resolution and caching pattern to copy from idals
|
||||
|
||||
`../idawilli/idals/idals.py` is the best starting point for the "raw file or existing database" problem. It already solves the user-visible behavior mapa needs.
|
||||
|
||||
Its pattern is:
|
||||
- if the input suffix is `.i64` or `.idb`, use that database directly
|
||||
- otherwise compute hashes for the raw file with `compute_file_hashes()` and use the SHA-256 as the cache key
|
||||
- store the generated database in a common cache directory, currently `~/.cache/hex-rays/idals/<sha256>.i64`
|
||||
- serialize access with `database_access_guard()`
|
||||
- detect an already-open or unpacked database by watching for the companion `.nam` file
|
||||
- use an advisory `flock` on `<db>.lock` to avoid concurrent writers
|
||||
- after acquiring the lock, re-check `.nam` to close the TOCTOU hole
|
||||
- on a cache miss, analyze the raw sample with `Database.open(..., IdaCommandOptions(auto_analysis=True, new_database=True, output_database=..., load_resources=True), save_on_close=True)`
|
||||
- after the cached database exists, open it read-only with `open_database_session(..., auto_analysis=False)` and `save_on_close=False`
|
||||
|
||||
mapa should adopt that pattern with only minor changes:
|
||||
- use the same SHA-256-keyed cache strategy
|
||||
- keep the same locking protocol
|
||||
- put the cache in a mapa-specific directory, or intentionally share the idals directory if reuse is desired
|
||||
- expose the cache location as a small helper or constant so it can be documented and tested
|
||||
- reuse the computed SHA-256 for the `meta` section instead of hashing the sample twice
|
||||
|
||||
There is one deliberate integration check to make here. `idals.py` uses `ida-domain`'s `Database.open(...)`, while capa's bootstrap path uses `idapro.open_database(...)` and disables Lumina explicitly. For mapa, prefer the `idals.py` open-and-cache pattern because it already handles the database lifecycle correctly. Then verify whether the `ida-domain` open path offers an equivalent way to suppress Lumina. If it does, use it. If it does not, decide whether that matters for mapa output or whether database creation should fall back to capa's `idapro.open_database(...)` path while cached-session opens keep the `idals.py` pattern.
|
||||
|
||||
## Recommended architecture
|
||||
|
||||
Do not port `scripts/mapa.py` by replacing each Lancelot query inline. Split it into four layers:
|
||||
- CLI and argument parsing
|
||||
- IDA bootstrap and environment setup
|
||||
- report collection
|
||||
- rendering
|
||||
|
||||
Use backend-neutral dataclasses for the report model:
|
||||
- `MapaReport`
|
||||
- `MapaMeta`
|
||||
- `MapaSection`
|
||||
- `MapaLibrary`
|
||||
- `MapaFunction`
|
||||
- `MapaCall`
|
||||
- `MapaString`
|
||||
- `AssemblageRecord`
|
||||
|
||||
The collector should have one primary data-access layer: `ida-domain` for functions, flowcharts, instructions, strings, names, segments, xrefs, and database lifecycle. Existing capa helpers remain useful as semantic references and regression oracles.
|
||||
|
||||
## Best practical strategy
|
||||
|
||||
The implementation target is an IDALib-only collector with `ida-domain` as the primary API surface.
|
||||
|
||||
Concretely:
|
||||
- use `ida-domain` for function inventory, instruction iteration, CFG stats, name lookup, segment listing, xref walking, and cached database open/create
|
||||
- use the existing capa IDA code to understand the intended semantics for imports, externs, thunk resolution, data-reference chasing, and alternative names
|
||||
- if the implementer discovers a real `ida-domain` gap, document the gap explicitly before introducing lower-level `ida_*` calls
|
||||
|
||||
That gives the next implementer a clear target: no Lancelot, no default hybrid backend, and no legacy helper dependency unless a concrete gap forces it.
|
||||
|
||||
## Concrete mapping from mapa fields to capa/backend logic
|
||||
|
||||
| mapa field/behavior | First source to consult | Recommended implementation |
|
||||
|---|---|---|
|
||||
| IDALib discovery | `capa/features/extractors/ida/idalib.py` | Reuse `has_idalib()` / `load_idalib()` logic if mapa needs to bootstrap `idapro` availability itself. |
|
||||
| resolve/open DB | `../idawilli/idals/idals.py` | Use `resolve_database()` and `open_database_session()` as the primary pattern. |
|
||||
| cache key and cache DB path | `../idawilli/idals/idals.py` | Hash raw inputs once and key cached databases by SHA-256. |
|
||||
| Lumina suppression policy | `capa/loader.py`, `tests/fixtures.py` | Carry forward capa's disable-Lumina behavior if the chosen open path supports it. |
|
||||
| sample hashes | `../idawilli/idals/idals.py`, `capa/ida/helpers.py`, `extractor.py` | Reuse the SHA-256 computed for cache lookup; prefer IDA-provided hashes when opening an existing database. |
|
||||
| image base | `capa/ida/helpers.py` | Prefer IDA imagebase helper; use Domain API only if it exposes the same value clearly. |
|
||||
| sections | `helpers.get_segments()`, `file.extract_file_section_names()` | Use `db.segments`; match capa's header-segment filtering rules if needed. |
|
||||
| import modules/functions | `helpers.get_file_imports()` | Implement with `ida-domain` if the needed import data is exposed cleanly; otherwise use this helper as the semantic reference for normalization. |
|
||||
| externs | `helpers.get_file_externs()` | Match this behavior with `ida-domain` if possible; if not, document the missing API and then fall back deliberately. |
|
||||
| function inventory | `extractor.py`, `helpers.get_functions()` | Do not use extractor's default function list because it skips thunks/libs. Build a mapa-specific inventory with `ida-domain`. |
|
||||
| callers | `function.extract_function_calls_to()` | Reproduce the same behavior with domain xrefs and compare against this helper during validation. |
|
||||
| call targets | `insn.extract_function_calls_from()` | Reproduce the same behavior with domain xrefs and compare against this helper during validation. |
|
||||
| API calls | `insn.extract_insn_api_features()` | Match the import/extern/thunk resolution semantics exposed by this function. |
|
||||
| string refs | `helpers.find_data_reference_from_insn()`, `find_string_at()` | Match the same single-ref-chain behavior, with a maximum chase depth of `10`. |
|
||||
| function names | `function.extract_function_name()`, alternative-name helpers | Use normal name, demangled name, alternative names, and render Assemblage annotations separately without renaming the IDA function. |
|
||||
| B/E/I stats | `helpers.get_function_blocks()` | Match `PREDS | NOEXT` semantics; use domain flowchart if possible. |
|
||||
| function ordering | current `scripts/mapa.py` | Keep address order for deltas and rendering stability. |
|
||||
|
||||
## Step-by-step implementation plan
|
||||
|
||||
### 1. Freeze the current mapa output
|
||||
|
||||
Before editing code, save golden outputs from the current `scripts/mapa.py` for:
|
||||
- a sample with normal internal calls and imports
|
||||
- a sample with thunk-heavy call patterns
|
||||
- a sample with capa and Assemblage overlays
|
||||
|
||||
These are the parity targets.
|
||||
|
||||
### 2. Add `resolve_database()` and `open_database_session()` helpers
|
||||
|
||||
Base these directly on `../idawilli/idals/idals.py`.
|
||||
|
||||
`resolve_database()` should:
|
||||
- accept either a raw sample or an existing `.i64` / `.idb`
|
||||
- return existing databases unchanged
|
||||
- hash raw inputs once and use SHA-256 as the cache key
|
||||
- place cached databases under the XDG cache root in `mandiant/mapa/`, i.e. `$XDG_CACHE_HOME/mandiant/mapa/` when set, else `~/.cache/mandiant/mapa/`
|
||||
- guard cache creation with the same `.nam` + `flock` protocol from `database_access_guard()`
|
||||
- analyze cache misses with `Database.open(..., IdaCommandOptions(auto_analysis=True, new_database=True, output_database=..., load_resources=True), save_on_close=True)`
|
||||
- keep cache creation transparent in normal mode and only log cache details in verbose/debug mode
|
||||
|
||||
`open_database_session()` should:
|
||||
- use the same guard before opening the database
|
||||
- open cached or user-supplied databases with `new_database=False`
|
||||
- default to `save_on_close=False`
|
||||
- optionally run `ida_auto.auto_wait()` when `auto_analysis=True`
|
||||
|
||||
This should become mapa's primary database lifecycle.
|
||||
|
||||
Then add one capa-derived check on top: if the chosen open path can suppress Lumina, do so. If the `ida-domain` path cannot, verify whether that difference affects naming enough to justify a fallback to capa's `idapro.open_database(...)` path during cache creation.
|
||||
|
||||
### 3. Introduce a backend-neutral report model
|
||||
|
||||
Before touching the collector logic, split `scripts/mapa.py` into:
|
||||
- CLI
|
||||
- collector
|
||||
- renderer
|
||||
- input-overlay parsing for capa JSON and Assemblage CSV
|
||||
|
||||
Keep the renderer stable. The collector should return value objects only.
|
||||
|
||||
### 4. Build a mapa-specific function inventory
|
||||
|
||||
Do not use `IdaFeatureExtractor.get_functions()` as-is, because it skips thunks and library functions.
|
||||
|
||||
Instead:
|
||||
- enumerate all functions in address order with `ida-domain` if possible
|
||||
- keep flags for `is_thunk`, `is_library`, and `is_external`
|
||||
- retain enough metadata to render thunks, skip imports from the function list, and compute deltas
|
||||
|
||||
For parity, compare your inventory against:
|
||||
- `helpers.get_functions(skip_thunks=False, skip_libs=False)`
|
||||
- IDA function flags such as `FUNC_THUNK` and `FUNC_LIB`
|
||||
|
||||
### 5. Recreate import and extern logic using capa's semantics
|
||||
|
||||
For the `libraries` section and for `api:` classification, start from the behavior encoded in:
|
||||
- `helpers.get_file_imports()`
|
||||
- `helpers.get_file_externs()`
|
||||
|
||||
That behavior already handles:
|
||||
- PE imports with `__imp_` prefixes
|
||||
- ELF imports with `@@version` suffixes
|
||||
- ordinal imports
|
||||
- extern functions in `SEG_XTRN`
|
||||
|
||||
The implementation target remains `ida-domain`. The next implementer should reproduce this behavior there if the API surface is available. If a real gap appears, document the gap before introducing any fallback.
|
||||
|
||||
### 6. Implement thunk resolution with capa's exact semantics
|
||||
|
||||
Build one cached helper, for example `resolve_thunk_target(ea)`, and use it everywhere.
|
||||
|
||||
Behavior should match capa's existing semantics:
|
||||
- maximum thunk-chain depth: `THUNK_CHAIN_DEPTH_DELTA == 5`
|
||||
- follow code refs first, then data refs if needed
|
||||
- only resolve through single-target chains
|
||||
- stop on cycles, zero-target, or multi-target cases
|
||||
- allow the final resolved target to be an import or extern
|
||||
|
||||
Use two existing code paths as references:
|
||||
- `capa/features/extractors/ida/insn.py:check_for_api_call()`
|
||||
- `capa/features/extractors/binexport2/__init__.py:BinExport2Analysis._compute_thunks()`
|
||||
|
||||
This helper must drive:
|
||||
- caller forwarding
|
||||
- `calls:` lines
|
||||
- `api:` lines
|
||||
- capa match attachment when a match lands in a thunk
|
||||
|
||||
### 7. Use capa features as references, not as the collector
|
||||
|
||||
Do not build mapa by instantiating `IdaFeatureExtractor()` and aggregating capa features into the final report. That would create a hidden second backend and blur the migration target.
|
||||
|
||||
Instead, query IDA directly through `ida-domain` and use the capa feature-extraction code as a reference when the intended semantics are unclear. The implementer should compare specific results against:
|
||||
- `Characteristic("calls to")`
|
||||
- `Characteristic("calls from")`
|
||||
- `API`
|
||||
- `String`
|
||||
- `FunctionName`
|
||||
- `Mnemonic`
|
||||
|
||||
This keeps the delivered collector IDALib-only while still giving the implementer a precise oracle for parity checks.
|
||||
|
||||
### 8. Recreate callers and callees
|
||||
|
||||
Use a precomputed normalized call graph. Do not compute callers ad hoc during rendering.
|
||||
|
||||
For each non-import function:
|
||||
- walk its instructions
|
||||
- identify call or jump-to-import patterns using the same logic as `extract_insn_api_features()`
|
||||
- resolve thunk chains
|
||||
- classify the resolved target as internal or API/import/extern
|
||||
- record caller and callee relationships on resolved targets
|
||||
|
||||
For parity, verify against these capa semantics:
|
||||
- function callers: `function.extract_function_calls_to()`
|
||||
- outgoing calls: `insn.extract_function_calls_from()`
|
||||
- API calls: `insn.extract_insn_api_features()`
|
||||
|
||||
Important detail: the existing helper treats both `call` and `jmp` as API-bearing instructions in some thunk/import cases. Do not assume `call` only.
|
||||
|
||||
### 9. Recreate B/E/I with capa's CFG semantics
|
||||
|
||||
For each rendered function:
|
||||
- basic blocks: count basic blocks using the equivalent of `helpers.get_function_blocks()`
|
||||
- edges: sum successors across those blocks
|
||||
- instructions: count instructions across those blocks
|
||||
- bytes: sum instruction sizes
|
||||
|
||||
The important parity rule is the CFG construction mode:
|
||||
- match `idaapi.FlowChart(f, flags=(idaapi.FC_PREDS | idaapi.FC_NOEXT))`
|
||||
|
||||
If the Domain API flowchart differs, use it only if it can match the no-external-block behavior. Otherwise use a tiny legacy helper for block enumeration and keep everything else in the Domain API.
|
||||
|
||||
### 10. Recreate string extraction with capa's data-ref chasing
|
||||
|
||||
Do not just test `db.strings.get_at(xref.to_ea)` and stop. That will miss the semantics capa already uses.
|
||||
|
||||
Start from capa's behavior:
|
||||
- follow a single data-reference chain from the instruction, up to depth `10`
|
||||
- if the final target is a string, emit it
|
||||
- otherwise it may be bytes, not a string
|
||||
|
||||
For mapa specifically:
|
||||
- only render strings, not raw bytes
|
||||
- deduplicate by rendered string value, matching the current script
|
||||
- trim trailing whitespace the same way the current script does
|
||||
|
||||
Reference implementation:
|
||||
- `helpers.find_data_reference_from_insn()`
|
||||
- `helpers.find_string_at()`
|
||||
- `insn.extract_insn_string_features()`
|
||||
|
||||
### 11. Reuse capa's name and alternative-name semantics
|
||||
|
||||
For the function display name, use this order:
|
||||
- demangled name
|
||||
- IDA function name
|
||||
- alternative names from comments if they help and the main name is poor
|
||||
- final fallback such as `sub_{ea:x}`
|
||||
|
||||
Render Assemblage source name and source file as annotations beneath the function header. Do not mutate the database just to apply Assemblage data.
|
||||
|
||||
Reference points:
|
||||
- `function.extract_function_name()`
|
||||
- `helpers.get_function_alternative_names()`
|
||||
|
||||
### 12. Reattach capa matches by containing function
|
||||
|
||||
Keep the current capa JSON input format, but simplify the mapping logic.
|
||||
|
||||
Recommended algorithm:
|
||||
- parse the capa JSON as today
|
||||
- for each absolute match address, ask IDA for the containing function
|
||||
- if that function is a thunk, resolve it through the thunk resolver
|
||||
- attach the rule name to the resolved function start EA
|
||||
- warn when no containing function exists
|
||||
|
||||
This is simpler than the current BinExport-specific mapping and aligns better with IDA's data model.
|
||||
|
||||
### 13. Rebuild top-level sections using capa-backed semantics
|
||||
|
||||
For `meta`:
|
||||
- sample name: input path or IDA metadata
|
||||
- hashes: prefer IDA-provided hash helpers in `capa/ida/helpers.py`
|
||||
- architecture: reuse the logic in `capa/features/extractors/ida/global_.py`
|
||||
- timestamp: define explicitly, because BinExport's old field is gone
|
||||
|
||||
For `sections`:
|
||||
- use `ida-domain` segments if possible
|
||||
- match capa's `skip_header_segments` behavior if needed
|
||||
|
||||
For `libraries`:
|
||||
- use `helpers.get_file_imports()` and group/display import modules accordingly
|
||||
|
||||
For `modules`:
|
||||
- remove the section entirely as an intentional interface change
|
||||
- document the removal in the spec so future ports do not try to reintroduce BinExport-specific `module` semantics accidentally
|
||||
|
||||
### 14. Add tests using capa's existing IDALib pattern
|
||||
|
||||
Pure tests should cover:
|
||||
- Assemblage parsing and RVA-to-VA mapping
|
||||
- thunk-chain resolution
|
||||
- import/extern normalization
|
||||
- string de-duplication and trimming
|
||||
- final rendering from a prebuilt `MapaReport`
|
||||
|
||||
Integration tests should reuse the same lifecycle mapa will use in production:
|
||||
- resolve the input to an existing or cached database
|
||||
- open it through the guarded session helper
|
||||
- collect the mapa report
|
||||
- compare key functions and sections against golden outputs
|
||||
|
||||
Use `tests/test_idalib_features.py` as the reference for version-specific skips and expectations, and use `../idawilli/idals/idals.py` as the reference for database resolution and guarded open/close behavior.
|
||||
|
||||
### 15. Validate parity and document deliberate differences
|
||||
|
||||
Compare the new output against the frozen Lancelot output on the supplied samples.
|
||||
|
||||
Verify specifically:
|
||||
- function ordering
|
||||
- thunk rendering
|
||||
- thunk-forwarded callers
|
||||
- internal vs API call classification
|
||||
- libraries/imports section contents
|
||||
- string extraction
|
||||
- B/E/I counts
|
||||
- Assemblage annotations and source-file separators
|
||||
- capa attachment
|
||||
|
||||
Document every known delta. The likely ones are:
|
||||
- function discovery differences between IDA and Lancelot
|
||||
- the intentional removal of the `modules` section
|
||||
- symbol differences across IDA versions, especially ELF on older 9.x
|
||||
- resource-dependent differences on older IDALib versions
|
||||
|
||||
## Minimal implementation checklist
|
||||
|
||||
A good order of work is:
|
||||
1. freeze current mapa outputs
|
||||
2. add backend-neutral report dataclasses
|
||||
3. add `resolve_database()` and `open_database_session()` helpers modeled on `idals.py`
|
||||
4. implement the XDG cache path and quiet-by-default cache creation behavior
|
||||
5. build a full mapa function inventory that includes thunks
|
||||
6. port sections and metadata
|
||||
7. implement import/extern classification to match capa semantics
|
||||
8. implement the thunk resolver using capa's existing semantics
|
||||
9. build normalized caller/callee/API indexes
|
||||
10. port B/E/I using `PREDS | NOEXT`-equivalent CFG traversal
|
||||
11. port string extraction using capa's data-ref-chain semantics
|
||||
12. port Assemblage overlay handling
|
||||
13. port capa JSON address-to-function attachment
|
||||
14. remove the `modules` section and document the interface change
|
||||
15. compare outputs against golden references
|
||||
16. document any proven `ida-domain` gaps and any intentional differences in spec/design during implementation
|
||||
|
||||
## Resolved decisions for the implementation handoff
|
||||
|
||||
Record these in `spec.md` or `design.md` during implementation so the behavior stays stable.
|
||||
|
||||
- accepted inputs: raw binary and existing IDA databases
|
||||
- cached databases live under the XDG cache root in `mandiant/mapa/`
|
||||
- mapa may create and persist cached IDA databases automatically
|
||||
- cache creation stays quiet in normal mode and only surfaces in verbose/debug logging
|
||||
- Lumina stays disabled for now
|
||||
- `meta.ts` becomes `datetime.now()`
|
||||
- remove the `modules` section from the report
|
||||
- the implementation target is IDALib only and all Lancelot dependencies should be removed
|
||||
- assume `ida-domain` is sufficient unless the implementer can demonstrate a specific missing API; any lower-level fallback must be justified and documented
|
||||
---
|
||||
# mapa plan: vendor Quantum Strand string tags
|
||||
|
||||
This plan describes how to extend mapa so every rendered `string:` line can carry right-aligned tags from Quantum Strand's string databases. The implementation target is broader than in the earlier draft. It should include the full set of useful database-backed tags now: open-source library tags, CRT tags, expert tags, winapi tags, global-prevalence tags, and junk-code tags. The feature is still strictly limited to database matches. It must not import Quantum Strand or FLOSS as a runtime library, and it must not pull in Quantum Strand's whole-file layout analysis, structure tagging, file offsets, encoding columns, or any other non-database context.
|
||||
|
||||
The implementor should work only in `mapa/`, `tests/`, and packaging/docs as needed. Nothing belongs under `capa/`. The sibling checkout at `../quantumstrand/` is only a research source and a place to copy vendored resources from once.
|
||||
|
||||
## What mapa should do when this lands
|
||||
|
||||
When mapa emits a string referenced by a function, the left side should stay in the current mapa style and the right side should show database-derived tags. The renderer should use Rich width calculations so the tag column stays visible and the string text is clipped first if necessary. The output should continue to be function-centric and concise.
|
||||
|
||||
A representative result looks like this:
|
||||
|
||||
```text
|
||||
string: "invalid distance code" #zlib
|
||||
string: "GetProcAddress" #winapi
|
||||
string: "!This program cannot be run in DOS mode." #common
|
||||
string: "CurrencyDispenser1" #capa
|
||||
string: "_initterm" #winapi #code-junk
|
||||
```
|
||||
|
||||
The model should retain richer match metadata than the text renderer shows. The renderer only needs visible tags. The report model should still remember which database family matched and any associated metadata that might matter later.
|
||||
|
||||
## Non-goals
|
||||
|
||||
This feature is not a Quantum Strand port. Do not bring over its file-layout tree, structure labels like `import table`, section box rendering, code-vs-data analysis, duplicate-string tagging, relocation tagging, xor-decoding tags, or hidden-string filtering. Those features solve a different problem. mapa already knows which function references a string and only needs database-backed context for that string.
|
||||
|
||||
mapa should not start suppressing strings based on database matches. Even if an upstream expert rule has `action: hide`, mapa should still render the string. Store the action in metadata if it exists, but do not use it to drop rows.
|
||||
|
||||
## Upstream resources to copy
|
||||
|
||||
All of the following resources should be vendored into mapa under a new package such as `mapa/string_tags/data/`.
|
||||
|
||||
Library and CRT databases:
|
||||
|
||||
```bash
|
||||
mkdir -p mapa/string_tags/data/oss mapa/string_tags/data/crt
|
||||
cp ../quantumstrand/floss/qs/db/data/oss/*.jsonl.gz mapa/string_tags/data/oss/
|
||||
cp ../quantumstrand/floss/qs/db/data/crt/msvc_v143.jsonl.gz mapa/string_tags/data/crt/
|
||||
```
|
||||
|
||||
This copies these library databases:
|
||||
`brotli.jsonl.gz`, `bzip2.jsonl.gz`, `cryptopp.jsonl.gz`, `curl.jsonl.gz`, `detours.jsonl.gz`, `jemalloc.jsonl.gz`, `jsoncpp.jsonl.gz`, `kcp.jsonl.gz`, `liblzma.jsonl.gz`, `libpcap.jsonl.gz`, `libsodium.jsonl.gz`, `mbedtls.jsonl.gz`, `openssl.jsonl.gz`, `sqlite3.jsonl.gz`, `tomcrypt.jsonl.gz`, `wolfssl.jsonl.gz`, `zlib.jsonl.gz`, plus the CRT database `msvc_v143.jsonl.gz`.
|
||||
|
||||
Expert, winapi, prevalence, and junk-code databases:
|
||||
|
||||
```bash
|
||||
mkdir -p mapa/string_tags/data/expert mapa/string_tags/data/winapi mapa/string_tags/data/gp
|
||||
cp ../quantumstrand/floss/qs/db/data/expert/capa.jsonl mapa/string_tags/data/expert/
|
||||
cp ../quantumstrand/floss/qs/db/data/winapi/apis.txt.gz mapa/string_tags/data/winapi/
|
||||
cp ../quantumstrand/floss/qs/db/data/winapi/dlls.txt.gz mapa/string_tags/data/winapi/
|
||||
cp ../quantumstrand/floss/qs/db/data/gp/gp.jsonl.gz mapa/string_tags/data/gp/
|
||||
cp ../quantumstrand/floss/qs/db/data/gp/cwindb-native.jsonl.gz mapa/string_tags/data/gp/
|
||||
cp ../quantumstrand/floss/qs/db/data/gp/cwindb-dotnet.jsonl.gz mapa/string_tags/data/gp/
|
||||
cp ../quantumstrand/floss/qs/db/data/gp/junk-code.jsonl.gz mapa/string_tags/data/gp/
|
||||
cp ../quantumstrand/floss/qs/db/data/gp/xaa-hashes.bin mapa/string_tags/data/gp/
|
||||
cp ../quantumstrand/floss/qs/db/data/gp/yaa-hashes.bin mapa/string_tags/data/gp/
|
||||
```
|
||||
|
||||
The implementor should also create `mapa/string_tags/SOURCES.md` and record the upstream repo path, upstream commit, copied files, and any code copied or rewritten from upstream. The research for this plan used upstream commit `73eb1541e896c065fc694ba7b01067f56871631b`.
|
||||
|
||||
## Upstream code to read before implementing
|
||||
|
||||
The useful Quantum Strand code is small. Before writing anything, read `../quantumstrand/floss/qs/db/oss.py`, `expert.py`, `gp.py`, `winapi.py`, `../quantumstrand/floss/qs/main.py`, and the tests `../quantumstrand/tests/test_oss_db.py`, `test_winapi_db.py`, `test_gp_db.py`, `test_qs.py`, and `test_qs_pma0101.py`.
|
||||
|
||||
The only part of `floss/qs/main.py` that should influence mapa design is the small tagging and Rich rendering logic. Leave the rest of that file behind.
|
||||
|
||||
## Behavior that must be preserved from Quantum Strand
|
||||
|
||||
Quantum Strand's database lookups are simple and should be preserved exactly.
|
||||
|
||||
The OSS and CRT databases are gzip-compressed JSONL files. Each line contains one `OpenSourceString` record with fields such as `string`, `library_name`, `library_version`, `file_path`, `function_name`, and `line_number`. Lookup is exact by `string`. A match emits tag `#<library_name>`. The CRT file uses `library_name: "msvc"`, so it emits `#msvc`.
|
||||
|
||||
The expert database file is plain `capa.jsonl`, not gzip-compressed despite what the readme says. Each record is an `ExpertRule` with `type`, `value`, `tag`, `action`, and descriptive metadata. Matching behavior follows `floss/qs/db/expert.py`: exact string match for `type == "string"`, substring search for `type == "substring"`, and `re.compile(rule.value).search(...)` for `type == "regex"`. A match emits `rule.tag`, which in the current vendored file is typically `#capa`.
|
||||
|
||||
The winapi database is two gzip-compressed text files. `dlls.txt.gz` is loaded into a lowercase set and matched against `string.lower()`. `apis.txt.gz` is loaded into a case-sensitive set and matched against the string verbatim. A match from either source emits `#winapi`.
|
||||
|
||||
The global-prevalence JSONL databases are `gp.jsonl.gz`, `cwindb-native.jsonl.gz`, and `cwindb-dotnet.jsonl.gz`. Quantum Strand loads them as `StringGlobalPrevalenceDatabase` and does exact string lookup. Any hit in any of those databases emits `#common`.
|
||||
|
||||
The junk-code JSONL database is `junk-code.jsonl.gz`. It has the same file format as the prevalence JSONL databases, but Quantum Strand treats it separately. Any hit emits `#code-junk`.
|
||||
|
||||
The hash databases are `xaa-hashes.bin` and `yaa-hashes.bin`. Each file is a flat sequence of 8-byte truncated MD5 digests. Quantum Strand computes `md5(string.encode("utf-8")).digest()[:8]` and checks membership in the set. A hit emits `#common`.
|
||||
|
||||
These match rules are the core of the feature. They are much more important than matching Quantum Strand's internal class names.
|
||||
|
||||
## Recommended mapa package layout
|
||||
|
||||
Add a dedicated package under `mapa/`. A good layout is `mapa/string_tags/__init__.py`, `model.py`, `loaders.py`, `tagger.py`, a `data/` subtree, and `SOURCES.md`.
|
||||
|
||||
Do not copy upstream modules verbatim unless necessary. A mapa-local rewrite is cleaner because the code is short and mapa needs a narrower API than Quantum Strand.
|
||||
|
||||
`model.py` should define two small dataclasses. `StringTagMatch` should capture one concrete match with fields like `tag`, `source_family`, `source_name`, `kind`, and optional metadata such as `library_name`, `library_version`, `file_path`, `function_name`, `line_number`, `note`, `description`, `action`, `global_count`, `encoding`, and `location`. `StringTagResult` should hold the final sorted tag tuple plus the tuple of `StringTagMatch` entries.
|
||||
|
||||
`loaders.py` should own the file-format readers. It should use `gzip`, `hashlib`, `msgspec`, and `importlib.resources`. There is no reason to invent a new parser. This project already depends on `msgspec`, which is also what Quantum Strand uses for the JSONL formats.
|
||||
|
||||
`tagger.py` should own the process-wide cached tagger. A simple shape is `load_default_tagger()` plus an object with `tag_string(raw: str) -> StringTagResult`. The tagger should lazily load and cache the vendored databases once per process.
|
||||
|
||||
## Report-model changes
|
||||
|
||||
`mapa/model.py` should be extended so a rendered mapa string can carry tags and match metadata. The minimal change is to add `tags` and `tag_matches` to `MapaString`. The existing `value` field should remain the display string. If the implementor wants to preserve the exact raw string too, add a `raw_value` field. That is worthwhile because mapa currently trims trailing whitespace before storing the string, and exact-match databases should run against the untrimmed value.
|
||||
|
||||
The most important collector rule is this: match against the raw extracted string first, derive the display string second, and deduplicate on the display string only after the database matches have been computed. If two raw strings collapse to the same display value after `rstrip()`, their tags and metadata should be merged onto the single rendered `MapaString` entry.
|
||||
|
||||
## Collector guidance
|
||||
|
||||
The collector should keep its existing string-discovery behavior. This plan does not ask the implementor to revisit how mapa follows data references or how it discovers a string in IDA. Once `collect_report()` recovers a raw string, the new tagging pipeline begins.
|
||||
|
||||
A good implementation sequence inside `mapa/collector.py` is: recover `raw_value`, call the vendored tagger on `raw_value`, compute `display_value = raw_value.rstrip()`, skip empty display values, and then either create or update the `MapaString` entry for that display value. The update path should union tag names and append only unique `StringTagMatch` values. The final `MapaString.tags` should be sorted for stable rendering and stable tests.
|
||||
|
||||
This is the one place where the current mapa behavior is most likely to cause silent misses. If the implementor tags only the trimmed string, exact-match results from Quantum Strand can be lost.
|
||||
|
||||
## Tag aggregation rules
|
||||
|
||||
The model should preserve all concrete matches, even when multiple databases emit the same visible tag. This matters most for `#common`, because a string may hit several prevalence databases and one or both hash databases. The visible tag list should deduplicate tag names, but the metadata should preserve every source that contributed.
|
||||
|
||||
The tagger should produce tags in a deterministic order. A simple stable order is alphabetical order on the tag name after aggregation. The metadata order should also be deterministic, for example by `(tag, source_family, source_name, library_name, note, value)`.
|
||||
|
||||
## Rendering guidance
|
||||
|
||||
Replace the current plain markup string for `string:` rows with a dedicated Rich `Text` builder. The implementor should read `render_string()` and related helpers in `../quantumstrand/floss/qs/main.py` and copy only the layout idea. The left side is the existing `string: "..."` text. The right side is the space-joined visible tag list. Width should come from Rich's own measurement.
|
||||
|
||||
A helper such as `Renderer.render_string_line(value: str, tags: Sequence[str]) -> Text` is sufficient. It should use `self.console.size.width - (self.indent * 2)` as the available width for the line content, build a `Text` object for the left side and another for the right side, reserve at least one separating space, and then align or truncate the left side so the right side stays visible. If the terminal is too narrow for that layout, fall back to a single-column form that still shows the tags.
|
||||
|
||||
mapa should adopt one Quantum Strand display rule because it reduces noise without hiding information: when a string has `#common` plus one or more more-specific tags, omit `#common` from the visible tag column but keep it in `tag_matches` and `MapaString.tags`. That is a rendering choice only. The underlying data should stay intact.
|
||||
|
||||
No string row should be hidden by tag policy. `#common` and `#code-junk` may be styled in a muted color. `#capa` may be highlighted. `#winapi` and library tags can use the default string-tag style unless the implementor finds a better minimal palette. The important behavior is visibility and stable alignment. Decorative styling is secondary.
|
||||
|
||||
## Recommended visible-tag policy
|
||||
|
||||
The rendered tag column should follow these rules.
|
||||
|
||||
Show all tags except `#common` when a more-specific tag is also present. Keep `#common` visible only if it is the only tag. Show `#code-junk` even when other tags are present because it communicates a different kind of context than `#common`. Show `#winapi`, `#capa`, and library tags directly. Do not invent mapa-specific aliases or rename the upstream tags.
|
||||
|
||||
This yields readable outputs such as `#winapi #code-junk`, `#capa`, `#zlib`, or `#common`. It avoids noisy combinations like `#common #winapi` on every common API name.
|
||||
|
||||
## Packaging guidance
|
||||
|
||||
If mapa needs to work from an installed package, `pyproject.toml` will need changes because it currently only packages `capa*`. The implementor should include `mapa*` packages and package data under `mapa/string_tags/data/`. The loader should use `importlib.resources.files()` so it works both from a source checkout and an installed wheel.
|
||||
|
||||
Even if packaging is deferred, the code should still use `importlib.resources` because it centralizes the resource lookup and avoids hard-coded repository-relative paths.
|
||||
|
||||
## Implementation steps for the handoff
|
||||
|
||||
The implementor should start by copying the resources, writing `mapa/string_tags/SOURCES.md`, and adding pure loader tests before touching mapa's collector or renderer. Then they should implement the small loader layer for the five upstream database families: OSS/CRT, expert, winapi, prevalence JSONL, and prevalence hash files. After that they should implement the aggregated tagger and add pure tagger tests using known literals from the vendored datasets.
|
||||
|
||||
Once the tagger is stable, they should extend `MapaString`, thread tagging through `mapa/collector.py`, and finally switch `mapa/renderer.py` to the Rich `Text`-based string-row helper. Only after all of that is working should they update packaging and installed-resource handling, because those changes are easier to verify when the core behavior already exists.
|
||||
|
||||
During implementation they should update `doc/plans/spec.md` and `doc/plans/design.md` to record the final user-visible behavior and the final module layout. The spec should say that `string:` rows may carry right-aligned database tags and should document the visible-tag policy. The design doc should say where the vendored databases live, how the loader is structured, and how the collector merges raw-string matches into deduplicated display strings.
|
||||
|
||||
## Concrete test plan
|
||||
|
||||
Most tests should avoid IDA. Start with pure loader and tagger tests. Known-good assertions from the upstream data include `"invalid distance code" -> #zlib`, `"IsolationAware function called after IsolationAwareCleanup" -> #msvc`, `"CurrencyDispenser1" -> #capa`, `"kernel32.dll" -> #winapi`, `"CreateFileA" -> #winapi`, and `"!This program cannot be run in DOS mode." -> #common`. `"_initterm"` is a useful mixed case because Quantum Strand's own tests show it as both `#winapi` and `#code-junk`.
|
||||
|
||||
Cover the expert database's three rule types: exact, substring, and regex. Add a hash-database test that emits `#common` even when the string is absent from the JSONL prevalence files. Add another case where several databases contribute the same visible tag and the metadata still records every contributing match.
|
||||
|
||||
Add renderer tests using a fixed-width Rich console. One test should show that an untagged string row still matches the old mapa format. Another should show that a tagged row keeps the tag column at the right edge. A narrow-width test should show that the string side is clipped first. Another should check that `#common` disappears from the visible tag list when a more-specific tag exists while remaining present in the underlying model.
|
||||
|
||||
Finally, add report tests in `tests/test_mapa.py` that build a small `MapaReport` directly. At least one string should carry a library tag, at least one should carry `#common`, and at least one should carry a multi-tag combination like `#winapi #code-junk`. None of these tests should require IDA.
|
||||
|
||||
## Performance and memory notes
|
||||
|
||||
Vendoring every requested database is still practical, but loading them all eagerly may have a noticeable startup cost. The tagger should therefore be cached process-wide and built lazily. Hash files should be read once into memory as sets of 8-byte digests. The string databases should be decoded once into in-memory maps. This is a good place to keep the code simple first and optimize only if startup becomes a measured problem.
|
||||
|
||||
The current compressed data footprint is modest for OSS, CRT, expert, and winapi. The prevalence family is the largest part of the set, especially the hash files. That is another reason to centralize loading and avoid repeated per-function or per-string initialization.
|
||||
|
||||
## Notes the implementor should not miss
|
||||
|
||||
`floss/qs/db/data/expert/readme.md` says the expert database is gzip-compressed, but the shipped file is plain `capa.jsonl`. Follow the code and the actual file on disk. `floss/qs/db/oss.py` includes the CRT file in `DEFAULT_PATHS`, so treat `#msvc` as part of the library tagging feature. Quantum Strand's `remove_false_positive_lib_strings()` should not be copied because its five-hit threshold is tuned for whole-file triage and fits mapa's per-function presentation poorly.
|
||||
|
||||
The main risk in this work is not the file formats. It is silent semantic drift during integration. The implementor should preserve Quantum Strand's exact query rules, tag against the raw string before trimming, keep all concrete matches in metadata, and only simplify at the renderer boundary.
|
||||
@@ -1,90 +0,0 @@
|
||||
# mapa specification
|
||||
|
||||
mapa renders either a structured text report or a self-contained HTML map of a binary's function map: metadata, sections, import libraries, and a per-function breakdown of callers, callees, API calls, strings, CFG stats, capa rule matches, and optional Assemblage ground truth.
|
||||
|
||||
## Invocation
|
||||
|
||||
```
|
||||
python scripts/mapa.py <input_file> [--capa <capa.json>] [--assemblage <functions.csv>] [--output {text,html-map}] [--open] [--verbose] [--quiet]
|
||||
```
|
||||
|
||||
`input_file` accepts raw binaries (PE, ELF), existing IDA databases (`.i64`, `.idb`), or any file IDA can analyze. For raw files, mapa automatically creates and caches an analyzed IDA database under the XDG cache root (`$XDG_CACHE_HOME/mandiant/mapa/` or `~/.cache/mandiant/mapa/`) keyed by the file's SHA-256 hash.
|
||||
|
||||
## Backend
|
||||
|
||||
IDALib only. All analysis uses `ida-domain` as the primary query API. The Lancelot/BinExport2 backend has been removed.
|
||||
|
||||
## Output modes
|
||||
|
||||
`text` is the default. It renders the existing structured terminal report to stdout.
|
||||
|
||||
`html-map` renders a single standalone HTML document to stdout. The page inlines all HTML, CSS, JavaScript, and data. It has a compact metadata summary and tag control strip at the top, then a split view below. The left pane contains the function grid and the right pane contains the program-string list.
|
||||
|
||||
The two panes scroll independently. A draggable vertical divider lets the user resize the panes horizontally. Function squares stay in function-address order and still use the naive left-to-right wrapping layout, but they now wrap within the current width of the left pane rather than the full page width.
|
||||
|
||||
Function squares are fixed small blocks laid out left-to-right and wrapped responsively within the left pane. Hovering a tag highlights matching functions by border color and dims non-matches. Clicking a tag locks or unlocks that tag selection. Hovering a string row highlights matching functions by fill color and dims non-matches. Clicking a string row locks or unlocks that string selection. When both a tag and a string are active, a function stays emphasized if it matches either one.
|
||||
|
||||
The tag strip is sorted by descending distinct-function count, then tag name, and each control shows that count. The page also shows a small legend describing border, fill, and dim states. The string list shows each string's virtual address explicitly, preserves duplicate display values at different addresses, and shows visible tags right-aligned in each row. Function hover shows a tooltip containing the same single-function mapa summary content as text mode. Top-level tag controls use only string tags. Capa rule names are not included there.
|
||||
|
||||
`--open` is only valid with `--output html-map`. In that mode, mapa writes the HTML to a temporary `.html` file, opens the user's local web browser on the corresponding `file://` URL, and does not write the HTML document to stdout.
|
||||
|
||||
The visible-tag policy is the same in both modes: hide `#common` when a more-specific tag is present, but keep it visible when it is the only tag.
|
||||
|
||||
## Report sections
|
||||
|
||||
The text report renders these sections in order:
|
||||
|
||||
1. meta — file name, SHA-256, architecture, timestamp
|
||||
2. sections — memory segments with address, permissions (rwx), and size
|
||||
3. libraries — import modules
|
||||
4. functions — per-function detail in address order
|
||||
|
||||
### Functions section
|
||||
|
||||
Each function renders as either `thunk <name> @ <address>` or `function <name> @ <address>` followed by:
|
||||
|
||||
- source-file separator — a horizontal rule inserted before a function when its primary Assemblage source path differs from the last seen non-empty source path
|
||||
- `assemblage name:` — source function name from Assemblage, when available
|
||||
- `assemblage file:` — source file path from Assemblage, when available
|
||||
- `xref:` — callers with direction arrow and function-order delta
|
||||
- `B/E/I:` — basic blocks / CFG edges / instructions (total bytes)
|
||||
- `capa:` — matched capa rule names
|
||||
- `calls:` — internal non-library callees with direction and delta
|
||||
- `api:` — import/external/library callees
|
||||
- `string:` — referenced strings (deduplicated, whitespace-trimmed), with optional right-aligned database tags
|
||||
|
||||
Thunk functions show only the header plus any Assemblage lines.
|
||||
|
||||
### Assemblage overlay
|
||||
|
||||
When `--assemblage` is provided, mapa reads a CSV file and requires these columns: `hash`, `name`, `start`, `end`, and `source_file`.
|
||||
|
||||
Assemblage matching works like this:
|
||||
|
||||
- mapa resolves the sample SHA-256 from the input file or the opened IDA database.
|
||||
- mapa keeps only CSV rows whose `hash` matches that SHA-256, case-insensitively.
|
||||
- mapa treats `start` and `end` as RVAs and adds the IDA database base address to map them to function VAs.
|
||||
- mapa does not rename functions, callers, or callees from Assemblage data. The displayed function header stays IDA-derived.
|
||||
- mapa strips the trailing provenance suffix from `source_file` before rendering, for example `C:\src\foo.c (MD5: ...)` renders as `C:\src\foo.c`.
|
||||
- Exact duplicate CSV rows are deduplicated. If multiple distinct Assemblage rows map to the same function address, mapa renders all of them in CSV order.
|
||||
- For source-file separators, mapa uses the first Assemblage record's normalized `source_file` path as the function's primary source path.
|
||||
- Missing Assemblage data does not start or end a source-file run. It does not trigger a separator and does not reset the last seen non-empty source path.
|
||||
- When a later function has a different primary source path from the last seen non-empty source path, mapa inserts a separator immediately before that function.
|
||||
|
||||
## Deliberate interface changes from the Lancelot/BinExport2 version
|
||||
|
||||
- The `modules` section has been removed. BinExport2's module concept has no IDA equivalent.
|
||||
|
||||
## Decisions
|
||||
|
||||
- 2026-03-16: Lumina disabled during database creation via `IdaCommandOptions(plugin_options="lumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0")`, matching capa's `loader.py`. The `plugin_options` field maps to IDA's `-O` switch; embedding `-O` in the value for the second option works because `build_args()` concatenates it verbatim. Resource loading enabled via `load_resources=True` (maps to `-R`).
|
||||
- 2026-03-16: Cache directory is `$XDG_CACHE_HOME/mandiant/mapa/` (or `~/.cache/mandiant/mapa/`). Separate from idals cache.
|
||||
- 2026-03-16: `meta.ts` is `datetime.now(UTC).isoformat()` — no longer sourced from BinExport2.
|
||||
- 2026-03-16: Thunk chain depth limit is 5 (matches capa's `THUNK_CHAIN_DEPTH_DELTA`).
|
||||
- 2026-03-16: CFG stats use `FlowChartFlags.NOEXT | FlowChartFlags.PREDS` to match capa's block enumeration semantics.
|
||||
- 2026-03-16: String extraction follows single data-reference chains up to depth 10, matching capa's `find_data_reference_from_insn`.
|
||||
- 2026-03-16: String rows may carry right-aligned database tags derived from vendored Quantum Strand string databases. Tags include `#<library>` (e.g. `#zlib`, `#openssl`), `#msvc`, `#capa`, `#winapi`, `#common`, and `#code-junk`. Visible tag policy: `#common` is hidden when a more-specific tag is present; `#code-junk` is always shown. Tags are matched against the raw (untrimmed) string value. The underlying model preserves all match metadata even when the renderer suppresses a visible tag.
|
||||
- 2026-03-16: Assemblage input is a CSV keyed by sample SHA-256. mapa matches rows by `hash`, converts `start`/`end` RVAs to VAs using the database base address, annotates functions with `assemblage name:` and `assemblage file:` lines, and does not override IDA-derived function names.
|
||||
- 2026-03-16: `--output html-map` uses only string tags in the top control strip, sorts them by descending distinct-function count then name, shows those counts in the controls, applies union semantics when both a tag and string selection are active, and lists program strings by string VA with explicit addresses.
|
||||
- 2026-03-16: `--output html-map` uses a split view with independently scrolling function and string panes, a draggable vertical divider, and right-aligned visible tags in each string row.
|
||||
- 2026-03-16: `--open` is valid only with `--output html-map`. It writes the HTML report to a temporary `.html` file and opens the local browser on that file instead of writing the HTML to stdout.
|
||||
@@ -7,7 +7,6 @@
|
||||
- [ ] Review changes
|
||||
- capa https://github.com/mandiant/capa/compare/\<last-release\>...master
|
||||
- capa-rules https://github.com/mandiant/capa-rules/compare/\<last-release\>...master
|
||||
- [ ] Run `$ bump-my-version bump {patch/minor/major} [--allow-dirty]` to update [capa/version.py](https://github.com/mandiant/capa/blob/master/capa/version.py) and other version files
|
||||
- [ ] Update [CHANGELOG.md](https://github.com/mandiant/capa/blob/master/CHANGELOG.md)
|
||||
- Do not forget to add a nice introduction thanking contributors
|
||||
- Remember that we need a major release if we introduce breaking changes
|
||||
@@ -37,6 +36,7 @@
|
||||
- [capa <release>...master](https://github.com/mandiant/capa/compare/<release>...master)
|
||||
- [capa-rules <release>...master](https://github.com/mandiant/capa-rules/compare/<release>...master)
|
||||
```
|
||||
- [ ] Update [capa/version.py](https://github.com/mandiant/capa/blob/master/capa/version.py)
|
||||
- [ ] Create a PR with the updated [CHANGELOG.md](https://github.com/mandiant/capa/blob/master/CHANGELOG.md) and [capa/version.py](https://github.com/mandiant/capa/blob/master/capa/version.py). Copy this checklist in the PR description.
|
||||
- [ ] Update the [homepage](https://github.com/mandiant/capa/blob/master/web/public/index.html) (i.e. What's New section)
|
||||
- [ ] After PR review, merge the PR and [create the release in GH](https://github.com/mandiant/capa/releases/new) using text from the [CHANGELOG.md](https://github.com/mandiant/capa/blob/master/CHANGELOG.md).
|
||||
|
||||
17
doc/usage.md
17
doc/usage.md
@@ -2,21 +2,6 @@
|
||||
|
||||
See `capa -h` for all supported arguments and usage examples.
|
||||
|
||||
## Ways to consume capa output
|
||||
|
||||
| Method | Output / interface | Typical use |
|
||||
|--------|--------------------|-------------|
|
||||
| **CLI** | Text (default, `-v`, `-vv`), JSON (`-j`), or other formats | Scripting, CI, one-off analysis |
|
||||
| [**IDA Pro**](https://github.com/mandiant/capa/tree/master/capa/ida/plugin) | capa Explorer plugin inside IDA | Interactive analysis with jump-to-address |
|
||||
| [**Ghidra**](https://github.com/mandiant/capa/tree/master/capa/ghidra/plugin) | capa Explorer plugin inside Ghidra | Interactive analysis with Ghidra integration |
|
||||
| [**Binary Ninja**](https://github.com/mandiant/capa/tree/master/capa/features/extractors/binja) | capa run using Binary Ninja as the analysis backend | Interactive analysis with Binary Ninja integration |
|
||||
| [**Dynamic (Sandbox)**](https://www.mandiant.com/resources/blog/dynamic-capa-executable-behavior-cape-sandbox) | capa run on dynamic sandbox report (CAPE, VMRay, etc.) | Dynamic analysis of sandbox output |
|
||||
| [**Web (capa Explorer)**](https://mandiant.github.io/capa/explorer/) | Web UI (upload JSON or load from URL) | Sharing results, viewing from VirusTotal or similar |
|
||||
|
||||
## Default vs verbose output
|
||||
|
||||
By default, capa shows only *top-level* rule matches: capabilities that are not already implied by another displayed rule. For example, if a rule "persist via Run registry key" matches and it *contains* a match for "set registry value", the default output lists only "persist via Run registry key". This keeps the default output short while still reflecting all detected capabilities at the top level. Use **`-v`** to see all rule matches, including nested ones. Use **`-vv`** for an even more detailed view that shows how each rule matched.
|
||||
|
||||
## tips and tricks
|
||||
|
||||
### only run selected rules
|
||||
@@ -26,7 +11,7 @@ For example, `capa -t william.ballenthin@mandiant.com` runs rules that reference
|
||||
|
||||
### only analyze selected functions
|
||||
Use the `--restrict-to-functions` option to extract capabilities from only a selected set of functions. This is useful for analyzing
|
||||
large functions and figuring out their capabilities and their address of occurrence; for example: PEB access, RC4 encryption, etc.
|
||||
large functions and figuring out their capabilities and their address of occurrence; for example: PEB access, RC4 encryption, etc.
|
||||
|
||||
To use this, you can copy the virtual addresses from your favorite disassembler and pass them to capa as follows:
|
||||
`capa sample.exe --restrict-to-functions 0x4019C0,0x401CD0`. If you add the `-v` option then capa will extract the interesting parts of a function for you.
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
import sys

from mapa.cli import main

# Entry point for `python -m mapa`: delegate to the CLI and propagate its
# integer exit status to the shell.
sys.exit(main())
|
||||
@@ -1,57 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from mapa.model import AssemblageRecord
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
REQUIRED_COLUMNS = frozenset({"hash", "name", "start", "end", "source_file"})


def validate_assemblage_columns(fieldnames: list[str] | None) -> None:
    """Raise ValueError if the CSV header lacks any required Assemblage column.

    Accepts the ``fieldnames`` attribute of a ``csv.DictReader`` (which may
    be None for an empty file).
    """
    present = set(fieldnames) if fieldnames else set()
    absent = sorted(column for column in REQUIRED_COLUMNS if column not in present)
    if absent:
        joined = ", ".join(absent)
        raise ValueError(f"assemblage CSV is missing required columns: {joined}")
|
||||
|
||||
|
||||
def load_assemblage_records(
    assemblage_path: Path,
    sample_sha256: str,
    base_address: int,
) -> dict[int, list[AssemblageRecord]]:
    """Load Assemblage CSV rows for one sample, grouped by function address.

    Rows are matched case-insensitively against *sample_sha256* on the
    ``hash`` column.  Exact duplicate rows per address are dropped while
    preserving CSV order for the remainder.

    Raises ValueError when *sample_sha256* is empty or the CSV header is
    missing required columns.
    """
    if not sample_sha256:
        raise ValueError("sample sha256 is required to load assemblage data")

    wanted_hash = sample_sha256.lower()
    grouped: defaultdict[int, list[AssemblageRecord]] = defaultdict(list)
    already_seen: defaultdict[int, set[AssemblageRecord]] = defaultdict(set)

    with assemblage_path.open("rt", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        validate_assemblage_columns(reader.fieldnames)
        for row in reader:
            candidate_hash = (row.get("hash") or "").strip().lower()
            if candidate_hash != wanted_hash:
                continue

            record = AssemblageRecord.from_csv_row(row, base_address=base_address)
            bucket = already_seen[record.address]
            if record not in bucket:
                bucket.add(record)
                grouped[record.address].append(record)

    total = sum(len(records) for records in grouped.values())
    logger.debug(
        "loaded %d assemblage records for %s from %s",
        total,
        wanted_hash,
        assemblage_path,
    )
    return dict(grouped)
|
||||
235
mapa/cli.py
235
mapa/cli.py
@@ -1,235 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
import webbrowser
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import Callable
|
||||
|
||||
from rich.console import Console
|
||||
from rich.logging import RichHandler
|
||||
from rich.theme import Theme
|
||||
|
||||
from mapa.assemblage import load_assemblage_records
|
||||
from mapa.model import AssemblageRecord
|
||||
|
||||
logger = logging.getLogger("mapa")
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the mapa command line."""
    parser = argparse.ArgumentParser(description="mapa: binary function map")
    parser.add_argument(
        "input_file", type=Path, help="path to input file (binary, .i64, or .idb)"
    )

    # Optional path inputs.
    path_options = (
        ("--capa", "path to capa JSON results file"),
        ("--assemblage", "path to Assemblage CSV file"),
    )
    for flag, description in path_options:
        parser.add_argument(flag, type=Path, help=description)

    parser.add_argument(
        "--output",
        choices=("text", "html-map"),
        default="text",
        help="output format",
    )
    parser.add_argument(
        "--open",
        action="store_true",
        help="when used with --output html-map, write to a temp file and open it in a browser",
    )

    # Logging verbosity switches.
    verbosity_options = (
        ("--verbose", "enable verbose logging"),
        ("--quiet", "disable all output but errors"),
    )
    for flag, description in verbosity_options:
        parser.add_argument(flag, action="store_true", help=description)

    return parser
|
||||
|
||||
|
||||
def validate_output_options(output: str, open_report: bool) -> None:
    """Reject the --open flag unless the html-map output mode is selected."""
    if output == "html-map":
        return
    if open_report:
        raise ValueError("--open requires --output html-map")
|
||||
|
||||
|
||||
def write_temp_html_report(content: str, directory: Path | None = None) -> Path:
    """Write *content* to a new ``mapa-*.html`` temp file and return its path.

    The file is not deleted on close; the caller owns its lifetime.  When
    *directory* is None the system temp directory is used.
    """
    handle = NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        suffix=".html",
        prefix="mapa-",
        delete=False,
        dir=None if directory is None else str(directory),
    )
    with handle:
        handle.write(content)
    return Path(handle.name)
|
||||
|
||||
|
||||
def open_html_report(
    content: str,
    opener: Callable[[str], bool] | None = None,
    directory: Path | None = None,
) -> Path:
    """Persist *content* to a temp HTML file and open it in a browser.

    *opener* defaults to ``webbrowser.open`` and is injectable for testing.
    Returns the temp file path; raises RuntimeError when the opener reports
    failure.
    """
    destination = write_temp_html_report(content, directory=directory)
    launch = webbrowser.open if opener is None else opener
    if not launch(destination.as_uri()):
        raise RuntimeError(f"failed to open browser for {destination}")
    return destination
|
||||
|
||||
|
||||
def _load_capa_matches(
    capa_path: Path,
    thunk_targets: dict[int, int],
) -> dict[int, set[str]]:
    """Load capa JSON and map rule matches to function addresses.

    Match locations that fall inside a matched basic block are attributed to
    that block's owning function; matches on a thunk are forwarded to the
    thunk's resolved target.
    """
    doc = json.loads(capa_path.read_text())

    # basic-block address -> owning function address, from the capa layout.
    bb_to_function: dict[int, int] = {}
    for function in doc["meta"]["analysis"]["layout"]["functions"]:
        function_va = function["address"]["value"]
        for basic_block in function["matched_basic_blocks"]:
            bb_to_function[basic_block["address"]["value"]] = function_va

    # raw match address -> rule names (absolute addresses only).
    rules_by_location: defaultdict[int, set[str]] = defaultdict(set)
    for rule_name, results in doc["rules"].items():
        for location, _ in results["matches"]:
            if location["type"] == "absolute":
                rules_by_location[location["value"]].add(rule_name)

    rules_by_function: defaultdict[int, set[str]] = defaultdict(set)
    for location_va, rule_names in rules_by_location.items():
        function_va = bb_to_function.get(location_va, location_va)

        if function_va in thunk_targets:
            logger.debug(
                "forwarding capa matches from thunk 0x%x to 0x%x",
                function_va,
                thunk_targets[function_va],
            )
            function_va = thunk_targets[function_va]

        rules_by_function[function_va].update(rule_names)
        for rule_name in rule_names:
            logger.info("capa: 0x%x: %s", function_va, rule_name)

    return dict(rules_by_function)
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """mapa CLI entry point.

    Parses arguments, resolves/opens the IDA database for the input file,
    optionally overlays capa and Assemblage data, collects the report, and
    renders it as text (stdout) or an HTML map.  Returns the process exit
    code (0 on success).
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = build_parser()
    args = parser.parse_args(args=argv)
    try:
        validate_output_options(args.output, args.open)
    except ValueError as error:
        # parser.error() prints usage to stderr and exits with status 2.
        parser.error(str(error))

    # Log to stderr so stdout stays free for the rendered report.
    stderr_console = Console(stderr=True)
    logging.basicConfig(
        level=logging.DEBUG
        if args.verbose
        else (logging.ERROR if args.quiet else logging.INFO),
        format="%(message)s",
        handlers=[
            RichHandler(
                console=stderr_console, show_path=False, rich_tracebacks=args.verbose
            )
        ],
    )

    # Deferred imports: pulled in only after argument parsing and logging
    # are configured (these modules load IDA-related dependencies).
    from mapa.collector import collect_report
    from mapa.html_renderer import render_html_map
    from mapa.ida_db import open_database_session, resolve_database
    from mapa.renderer import render_report

    t0 = time.time()
    db_path, md5, sha256 = resolve_database(args.input_file)
    logger.debug("perf: resolve_database: %0.2fs", time.time() - t0)

    theme = Theme(
        {
            "decoration": "grey54",
            "title": "yellow",
            "key": "blue",
            "value": "blue",
            "default": "blue",
        },
        inherit=False,
    )
    # Markup and emoji are disabled so report content renders verbatim.
    console = Console(theme=theme, markup=False, emoji=False)

    t0 = time.time()
    with open_database_session(db_path) as db:
        logger.debug("perf: open_database: %0.2fs", time.time() - t0)

        base_address = db.base_address or 0
        # Prefer the hash computed from the input file; fall back to the db's.
        effective_sha256 = sha256 or db.sha256 or ""

        assemblage_records_by_address: dict[int, list[AssemblageRecord]] = {}
        if args.assemblage:
            assemblage_records_by_address = load_assemblage_records(
                args.assemblage,
                sample_sha256=effective_sha256,
                base_address=base_address,
            )

        matches_by_function: dict[int, set[str]] = {}
        if args.capa:
            from ida_domain.functions import FunctionFlags

            from mapa.collector import (
                _build_extern_index,
                _build_import_index,
                _resolve_thunk_target,
            )

            import_index = _build_import_index(db)
            extern_addrs = _build_extern_index(db)

            # Resolve thunk targets so capa matches reported on a thunk can
            # be forwarded to the real function in _load_capa_matches.
            thunk_targets: dict[int, int] = {}
            for func in db.functions:
                flags = db.functions.get_flags(func)
                if flags and FunctionFlags.THUNK in flags:
                    target = _resolve_thunk_target(
                        db, int(func.start_ea), import_index, extern_addrs
                    )
                    if target is not None:
                        thunk_targets[int(func.start_ea)] = target

            matches_by_function = _load_capa_matches(
                args.capa,
                thunk_targets,
            )

        t0 = time.time()
        report = collect_report(
            db,
            md5=md5,
            sha256=effective_sha256,
            matches_by_function=matches_by_function,
            assemblage_records_by_address=assemblage_records_by_address,
        )
        logger.debug("perf: collect_report: %0.2fs", time.time() - t0)

        t0 = time.time()
        if args.output == "html-map":
            html = render_html_map(report)
            if args.open:
                # --open: write to a temp file and launch the browser
                # instead of writing the document to stdout.
                report_path = open_html_report(html)
                logger.info("opened html map: %s", report_path)
            else:
                sys.stdout.write(html)
        else:
            render_report(report, console)
        logger.debug("perf: render_report: %0.2fs", time.time() - t0)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -1,492 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from ida_domain.database import Database
|
||||
from ida_domain.flowchart import FlowChartFlags
|
||||
from ida_domain.functions import FunctionFlags
|
||||
|
||||
from mapa.model import (
|
||||
AssemblageRecord,
|
||||
MapaCall,
|
||||
MapaCaller,
|
||||
MapaFunction,
|
||||
MapaLibrary,
|
||||
MapaMeta,
|
||||
MapaProgramString,
|
||||
MapaReport,
|
||||
MapaSection,
|
||||
MapaString,
|
||||
)
|
||||
from mapa.strings import (
|
||||
MAX_STRING_READ,
|
||||
extract_ascii_from_buf,
|
||||
extract_utf16le_from_buf,
|
||||
)
|
||||
from mapa.string_tags.tagger import StringTagger, load_default_tagger
|
||||
|
||||
logger = logging.getLogger(__name__)

# Maximum number of hops to follow when resolving a thunk chain
# (same value as capa's THUNK_CHAIN_DEPTH_DELTA).
THUNK_CHAIN_DEPTH_DELTA = 5
|
||||
|
||||
|
||||
def _get_permissions_string(perm: int) -> str:
|
||||
return (
|
||||
("r" if perm & 4 else "-")
|
||||
+ ("w" if perm & 2 else "-")
|
||||
+ ("x" if perm & 1 else "-")
|
||||
)
|
||||
|
||||
|
||||
def _collect_meta(db: Database, md5: str, sha256: str) -> MapaMeta:
    """Collect report metadata from the database.

    Caller-supplied hashes win; the database's hashes fill in missing ones.
    The timestamp is the current UTC time, not a value from the database.
    """
    if not md5 and db.md5:
        md5 = db.md5
    if not sha256 and db.sha256:
        sha256 = db.sha256
    return MapaMeta(
        name=db.path or "",
        sha256=sha256,
        md5=md5,
        arch=db.architecture or "",
        timestamp=datetime.now(timezone.utc).isoformat(),
        base_address=db.base_address or 0,
    )
|
||||
|
||||
|
||||
def _collect_sections(db: Database) -> list[MapaSection]:
    """Enumerate memory segments as MapaSection records (address, size, perms, name)."""
    return [
        MapaSection(
            address=int(seg.start_ea),
            size=int(seg.end_ea) - int(seg.start_ea),
            perms=_get_permissions_string(int(seg.perm)),
            name=db.segments.get_name(seg) or "",
        )
        for seg in db.segments.get_all()
    ]
|
||||
|
||||
|
||||
def _normalize_module_name(name: str) -> str:
|
||||
"""Normalize an import module name to include extension.
|
||||
|
||||
IDA strips .dll from PE import module names (e.g. 'KERNEL32' instead of
|
||||
'KERNEL32.dll'). Add it back when the name has no extension.
|
||||
"""
|
||||
if "." not in name:
|
||||
return f"{name}.dll".lower()
|
||||
return name.lower()
|
||||
|
||||
|
||||
def _collect_libraries(db: Database) -> list[MapaLibrary]:
    """List imported modules as MapaLibrary records with normalized names."""
    return [
        MapaLibrary(name=_normalize_module_name(module.name))
        for module in db.imports.get_all_modules()
    ]
|
||||
|
||||
|
||||
def _build_import_index(db: Database) -> dict[int, tuple[str, str]]:
    """Build address -> (normalized module name, function name) for all imports.

    Nameless imports are rendered as 'ord<N>' from their ordinal.
    """
    index: dict[int, tuple[str, str]] = {}
    for imp in db.imports.get_all_imports():
        symbol = imp.name if imp.name else f"ord{imp.ordinal}"
        index[int(imp.address)] = (_normalize_module_name(imp.module_name), symbol)
    return index
|
||||
|
||||
|
||||
def _build_extern_index(db: Database) -> set[int]:
    """Collect start addresses of functions living in XTRN segments."""
    externs: set[int] = set()
    for seg in db.segments.get_all():
        seg_class = db.segments.get_class(seg)
        if not seg_class or seg_class.upper() != "XTRN":
            continue
        externs.update(
            int(func.start_ea)
            for func in db.functions.get_between(int(seg.start_ea), int(seg.end_ea))
        )
    return externs
|
||||
|
||||
|
||||
def _resolve_thunk_target(
    db: Database,
    ea: int,
    import_index: dict[int, tuple[str, str]],
    extern_addrs: set[int],
) -> int | None:
    """Follow thunk chains up to THUNK_CHAIN_DEPTH_DELTA hops.

    At each hop the single outgoing non-flow code reference is preferred;
    when there is not exactly one, the single outgoing data reference is
    tried instead.  Resolution terminates at an import/extern address or at
    any non-thunk target; a thunk target continues the walk.

    Returns the final resolved address, or None if resolution fails.
    """

    def _single_ref_target(addr: int) -> int | None:
        # One-line purpose: pick the unique outgoing reference, code first.
        code_refs = list(db.xrefs.code_refs_from_ea(addr, flow=False))
        if len(code_refs) == 1:
            return int(code_refs[0])
        data_refs = list(db.xrefs.data_refs_from_ea(addr))
        if len(data_refs) == 1:
            return int(data_refs[0])
        return None

    def _is_chained_thunk(target: int) -> bool:
        # One-line purpose: True when target is itself a thunk to step through.
        if target in import_index or target in extern_addrs:
            # imports/externs terminate the chain even if flagged as thunks.
            return False
        target_func = db.functions.get_at(target)
        if not target_func:
            return False
        flags = db.functions.get_flags(target_func)
        return bool(flags and FunctionFlags.THUNK in flags)

    current = ea
    for _ in range(THUNK_CHAIN_DEPTH_DELTA):
        next_hop = _single_ref_target(current)
        if next_hop is None:
            # No unambiguous reference to follow: resolution fails.
            return None
        if _is_chained_thunk(next_hop):
            current = next_hop
            continue
        return next_hop
    # Chain longer than the depth limit: give up.
    return None
|
||||
|
||||
|
||||
def _find_string_at(db: Database, ea: int) -> str | None:
    """Try to decode an ASCII or UTF-16 LE string at *ea*; None on failure.

    Reads up to MAX_STRING_READ bytes; ASCII decoding is attempted first.
    """
    try:
        raw = db.bytes.get_bytes_at(ea, MAX_STRING_READ)
    except Exception:
        return None
    if not raw:
        return None
    decoded = extract_ascii_from_buf(raw)
    if decoded is None:
        decoded = extract_utf16le_from_buf(raw)
    return decoded
|
||||
|
||||
|
||||
def _find_data_reference_string(
    db: Database, insn_ea: int, max_depth: int = 10
) -> tuple[int, str] | None:
    """Follow single data-reference chains from an instruction to find a string.

    Walks at most *max_depth* hops; each hop requires exactly one outgoing
    data reference to a valid address.  Returns (string address, decoded
    string), or None when no string is found.
    """
    cursor = insn_ea
    for _ in range(max_depth):
        try:
            refs = list(db.xrefs.data_refs_from_ea(cursor))
        except Exception:
            return None
        if len(refs) != 1:
            return None
        candidate = int(refs[0])
        if not db.is_valid_ea(candidate):
            return None
        decoded = _find_string_at(db, candidate)
        if decoded is not None:
            return candidate, decoded
        cursor = candidate
    return None
|
||||
|
||||
|
||||
def _merge_string_metadata(
|
||||
tags: tuple[str, ...],
|
||||
tag_matches: tuple,
|
||||
new_tags: tuple[str, ...],
|
||||
new_tag_matches: tuple,
|
||||
) -> tuple[tuple[str, ...], tuple]:
|
||||
merged_tags = tuple(sorted(set(tags) | set(new_tags)))
|
||||
seen_match_keys = {match.sort_key for match in tag_matches}
|
||||
unique_new = tuple(
|
||||
match for match in new_tag_matches if match.sort_key not in seen_match_keys
|
||||
)
|
||||
return merged_tags, tag_matches + unique_new
|
||||
|
||||
|
||||
def collect_report(
    db: Database,
    md5: str = "",
    sha256: str = "",
    matches_by_function: dict[int, set[str]] | None = None,
    assemblage_records_by_address: dict[int, list[AssemblageRecord]] | None = None,
    tagger: StringTagger | None = None,
) -> MapaReport:
    """Collect a complete mapa report from an open IDA database.

    Walks every function to gather metadata, builds a thunk-resolved call
    graph, computes per-function CFG statistics, and extracts and tags
    strings reachable through single data-reference chains.

    Args:
        db: an open ida_domain Database.
        md5: file MD5 to record in the report metadata (may be empty).
        sha256: file SHA-256 to record in the report metadata (may be empty).
        matches_by_function: capa rule names keyed by function start address.
        assemblage_records_by_address: Assemblage records keyed by address.
        tagger: string tagger; the default tagger is loaded when omitted.
    """
    if matches_by_function is None:
        matches_by_function = {}
    if assemblage_records_by_address is None:
        assemblage_records_by_address = {}
    if tagger is None:
        tagger = load_default_tagger()

    meta = _collect_meta(db, md5, sha256)
    sections = _collect_sections(db)
    libraries = _collect_libraries(db)
    import_index = _build_import_index(db)
    extern_addrs = _build_extern_index(db)

    # pass 1: enumerate all functions with their thunk/library flags.
    all_functions: list[tuple[int, object, bool, bool]] = []
    for func in db.functions:
        ea = int(func.start_ea)
        flags = db.functions.get_flags(func)
        is_thunk = flags is not None and FunctionFlags.THUNK in flags
        is_lib = flags is not None and FunctionFlags.LIB in flags
        all_functions.append((ea, func, is_thunk, is_lib))

    # address order defines the stable "order" index used for call deltas.
    all_functions.sort(key=lambda x: x[0])

    func_address_to_order: dict[int, int] = {}
    for i, (ea, _, _, _) in enumerate(all_functions):
        func_address_to_order[ea] = i

    # pass 2: resolve each thunk to its ultimate target so call edges can
    # point past the trampoline.
    thunk_targets: dict[int, int] = {}
    for ea, func, is_thunk, _ in all_functions:
        if is_thunk:
            target = _resolve_thunk_target(db, ea, import_index, extern_addrs)
            if target is not None:
                thunk_targets[ea] = target

    # pass 3: build the call graph (callers and callees) with thunks resolved.
    resolved_callers: dict[int, set[int]] = {}
    resolved_callees: dict[int, list[tuple[int, bool]]] = {}

    for ea, func, is_thunk, is_lib in all_functions:
        # thunks and import/extern stubs do not contribute their own edges
        if is_thunk or ea in import_index or ea in extern_addrs:
            continue

        fc = db.functions.get_flowchart(
            func, flags=FlowChartFlags.NOEXT | FlowChartFlags.PREDS
        )
        if fc is None:
            continue

        seen_callees: set[int] = set()
        callees: list[tuple[int, bool]] = []

        for block in fc:
            insns = block.get_instructions()
            if insns is None:
                continue
            for insn in insns:
                if not db.instructions.is_call_instruction(insn):
                    # also check for jumps to imports (thunk pattern)
                    mnem = db.instructions.get_mnemonic(insn)
                    if mnem and mnem.lower().startswith("jmp"):
                        call_targets = list(
                            db.xrefs.code_refs_from_ea(int(insn.ea), flow=False)
                        )
                    else:
                        continue
                else:
                    call_targets = list(db.xrefs.calls_from_ea(int(insn.ea)))
                    if not call_targets:
                        # fall back to non-flow code refs when IDA records no
                        # explicit call xref (e.g. indirect calls)
                        call_targets = list(
                            db.xrefs.code_refs_from_ea(int(insn.ea), flow=False)
                        )

                for target_ea in call_targets:
                    target_ea = int(target_ea)
                    resolved_target = target_ea

                    # route edges through thunks to their real targets
                    if target_ea in thunk_targets:
                        resolved_target = thunk_targets[target_ea]

                    if resolved_target in seen_callees:
                        continue
                    seen_callees.add(resolved_target)

                    is_api = (
                        resolved_target in import_index
                        or resolved_target in extern_addrs
                    )
                    callees.append((resolved_target, is_api))

                    if resolved_target not in resolved_callers:
                        resolved_callers[resolved_target] = set()
                    resolved_callers[resolved_target].add(ea)

        resolved_callees[ea] = callees

    # pass 4: materialize MapaFunction entries plus program-wide strings.
    mapa_functions: list[MapaFunction] = []
    program_strings_by_address: dict[int, MapaProgramString] = {}
    for ea, func, is_thunk, is_lib in all_functions:
        if ea in import_index or ea in extern_addrs:
            continue

        name = db.functions.get_name(func) or f"sub_{ea:x}"

        order = func_address_to_order[ea]

        mf = MapaFunction(
            address=ea,
            name=name,
            is_thunk=is_thunk,
            is_library=is_lib,
            assemblage_records=list(assemblage_records_by_address.get(ea, [])),
        )

        # thunks get an entry but no CFG/call/string details
        if is_thunk:
            mapa_functions.append(mf)
            continue

        fc = db.functions.get_flowchart(
            func, flags=FlowChartFlags.NOEXT | FlowChartFlags.PREDS
        )
        if fc is not None:
            # CFG statistics: blocks, edges, instruction count and total bytes
            num_blocks = 0
            num_edges = 0
            num_insns = 0
            total_bytes = 0

            for block in fc:
                num_blocks += 1
                num_edges += block.count_successors()
                insns = block.get_instructions()
                if insns is None:
                    continue
                for insn in insns:
                    num_insns += 1
                    insn_size = db.heads.size(int(insn.ea))
                    total_bytes += insn_size

            mf.num_basic_blocks = num_blocks
            mf.num_edges = num_edges
            mf.num_instructions = num_insns
            mf.total_instruction_bytes = total_bytes

        # incoming edges, annotated with position delta in address order
        for caller_ea in sorted(resolved_callers.get(ea, set())):
            if caller_ea not in func_address_to_order:
                continue
            caller_order = func_address_to_order[caller_ea]
            delta = caller_order - order
            direction = "↑" if delta < 0 else "↓"
            caller_func = db.functions.get_at(caller_ea)
            caller_name = (
                db.functions.get_name(caller_func)
                if caller_func
                else f"sub_{caller_ea:x}"
            )
            mf.callers.append(
                MapaCaller(
                    name=caller_name or f"sub_{caller_ea:x}",
                    address=caller_ea,
                    delta=delta,
                    direction=direction,
                )
            )

        # outgoing edges: APIs (imports/externs) vs. local calls
        for target_ea, is_api in resolved_callees.get(ea, []):
            if is_api:
                if target_ea in import_index:
                    module_name, func_name = import_index[target_ea]
                    api_name = f"{module_name}!{func_name}"
                else:
                    target_func = db.functions.get_at(target_ea)
                    api_name = (
                        db.functions.get_name(target_func)
                        if target_func
                        else f"sub_{target_ea:x}"
                    )
                    api_name = api_name or f"sub_{target_ea:x}"
                mf.apis.append(
                    MapaCall(
                        name=api_name,
                        address=target_ea,
                        is_api=True,
                    )
                )
            else:
                if target_ea not in func_address_to_order:
                    continue
                target_order = func_address_to_order[target_ea]
                delta = target_order - order
                direction = "↑" if delta < 0 else "↓"
                target_func = db.functions.get_at(target_ea)
                target_name = (
                    db.functions.get_name(target_func)
                    if target_func
                    else f"sub_{target_ea:x}"
                )
                mf.calls.append(
                    MapaCall(
                        name=target_name or f"sub_{target_ea:x}",
                        address=target_ea,
                        is_api=False,
                        delta=delta,
                        direction=direction,
                    )
                )

        if fc is not None:
            # string extraction; dedupe per-function by display text
            seen_strings: dict[str, MapaString] = {}
            # NOTE(review): the flowchart is re-fetched here — presumably the
            # first flowchart iterator is not reusable; confirm with ida_domain
            fc2 = db.functions.get_flowchart(
                func, flags=FlowChartFlags.NOEXT | FlowChartFlags.PREDS
            )
            if fc2 is not None:
                for block in fc2:
                    insns = block.get_instructions()
                    if insns is None:
                        continue
                    for insn in insns:
                        string_result = _find_data_reference_string(db, int(insn.ea))
                        if string_result is None:
                            continue
                        string_ea, raw = string_result
                        tag_result = tagger.tag_string(raw)
                        display = raw.rstrip()
                        if not display:
                            continue
                        if display in seen_strings:
                            # same text seen again in this function: merge tags
                            # and keep the lowest address
                            existing = seen_strings[display]
                            existing.tags, existing.tag_matches = _merge_string_metadata(
                                existing.tags,
                                existing.tag_matches,
                                tag_result.tags,
                                tag_result.matches,
                            )
                            existing.address = min(existing.address, string_ea)
                        else:
                            ms = MapaString(
                                value=display,
                                address=string_ea,
                                tags=tuple(sorted(set(tag_result.tags))),
                                tag_matches=tag_result.matches,
                            )
                            seen_strings[display] = ms
                            mf.strings.append(ms)

                        # maintain the program-wide string index keyed by address
                        if string_ea in program_strings_by_address:
                            existing_program_string = program_strings_by_address[string_ea]
                            existing_program_string.tags, existing_program_string.tag_matches = _merge_string_metadata(
                                existing_program_string.tags,
                                existing_program_string.tag_matches,
                                tag_result.tags,
                                tag_result.matches,
                            )
                            existing_program_string.function_addresses = tuple(
                                sorted(
                                    set(existing_program_string.function_addresses)
                                    | {ea}
                                )
                            )
                        else:
                            program_strings_by_address[string_ea] = MapaProgramString(
                                value=display,
                                address=string_ea,
                                tags=tuple(sorted(set(tag_result.tags))),
                                tag_matches=tag_result.matches,
                                function_addresses=(ea,),
                            )

        mf.capa_matches = sorted(matches_by_function.get(ea, set()))
        mapa_functions.append(mf)

    return MapaReport(
        meta=meta,
        sections=sections,
        libraries=libraries,
        functions=mapa_functions,
        program_strings=sorted(
            program_strings_by_address.values(),
            key=lambda string: string.address,
        ),
    )
|
||||
@@ -1,306 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from html import escape
|
||||
|
||||
from mapa.model import MapaProgramString, MapaReport
|
||||
from mapa.renderer import _visible_tags, render_function_summary_text
|
||||
|
||||
|
||||
def _to_json(value: object) -> str:
|
||||
return json.dumps(value, separators=(",", ":")).replace("</", "<\\/")
|
||||
|
||||
|
||||
def _collect_tag_entries(report: MapaReport) -> list[tuple[str, list[int]]]:
    """Map each visible tag to the sorted indices of functions that carry it.

    Entries are ordered by descending function count, then tag name.
    Program strings with no visible tags, or none of whose functions appear
    in the report, are skipped.
    """
    index_of = {
        function.address: position
        for position, function in enumerate(report.functions)
    }
    functions_for_tag: dict[str, set[int]] = {}
    for string in report.program_strings:
        tags = _visible_tags(string.tags)
        if not tags:
            continue
        indices = {
            index_of[addr]
            for addr in string.function_addresses
            if addr in index_of
        }
        if not indices:
            continue
        for tag in tags:
            functions_for_tag.setdefault(tag, set()).update(indices)

    entries = [(tag, sorted(indices)) for tag, indices in functions_for_tag.items()]
    entries.sort(key=lambda entry: (-len(entry[1]), entry[0]))
    return entries
|
||||
|
||||
|
||||
def _render_string_row(program_string: MapaProgramString, index: int) -> str:
    """Render one clickable string row as an HTML <button> element."""
    tag_text = " ".join(_visible_tags(program_string.tags))
    address_hex = hex(program_string.address)

    pieces = [
        f'<button type="button" class="string-row" data-string-index="{index}" ',
        f'data-string-address="{escape(address_hex, quote=True)}" ',
        f'data-string-value="{escape(program_string.value, quote=True)}" ',
        f'data-string-tags="{escape(tag_text, quote=True)}">',
        f'<span class="string-address">{escape(address_hex)}</span>',
        f'<span class="string-value">{escape(program_string.value)}</span>',
    ]
    # the tags span is omitted entirely when there are no visible tags
    if tag_text:
        pieces.append(f'<span class="string-tags">{escape(tag_text)}</span>')
    pieces.append("</button>")
    return "".join(pieces)
|
||||
|
||||
|
||||
def render_html_map(report: MapaReport) -> str:
    """Render *report* as a self-contained interactive HTML page.

    The page shows a grid of function boxes next to a list of program
    strings, with tag filters; all data is embedded as JSON and driven by
    inline JavaScript, so the output is a single file with no external
    dependencies.
    """
    tag_entries = _collect_tag_entries(report)
    function_index_by_address = {
        function.address: index for index, function in enumerate(report.functions)
    }
    program_strings = sorted(report.program_strings, key=lambda string: string.address)

    # JSON payload embedded in the page and consumed by the inline script
    data = {
        "functions": [
            render_function_summary_text(function) for function in report.functions
        ],
        "tags": {tag: function_indices for tag, function_indices in tag_entries},
        "strings": [
            {
                "address": hex(program_string.address),
                "value": program_string.value,
                "functionIndices": [
                    function_index_by_address[address]
                    for address in program_string.function_addresses
                    if address in function_index_by_address
                ],
            }
            for program_string in program_strings
        ],
    }

    # document head: inline stylesheet for the two-pane layout
    parts: list[str] = [
        "<!doctype html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8">',
        f"<title>{escape(report.meta.name)} - mapa html map</title>",
        "<style>",
        "*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}",
        "html,body{height:100%}",
        ":root{--bg:#fff;--fg:#111;--muted:#666;--line:#cfcfcf;--fill:#d9d9d9;--tag:#2563eb;--string:#93c5fd;--square:10px}",
        "body{height:100vh;overflow:hidden;background:var(--bg);color:var(--fg);font:13px/1.4 ui-monospace,SFMono-Regular,Menlo,Consolas,monospace;padding:16px}",
        "body.is-resizing{cursor:col-resize}",
        "body.is-resizing *{user-select:none}",
        "main{height:100%;min-height:0;display:flex;flex-direction:column;gap:16px}",
        "section{display:flex;flex-direction:column;gap:8px}",
        ".meta,.legend,.pane-header{color:var(--muted)}",
        ".controls{display:flex;flex-wrap:wrap;gap:6px}",
        ".control{border:1px solid var(--line);background:transparent;color:inherit;padding:2px 6px;font:inherit;cursor:pointer}",
        ".control.is-active{border-color:var(--tag);color:var(--tag)}",
        ".control-count{color:var(--muted)}",
        ".split-view{flex:1;min-height:0;display:flex;align-items:stretch}",
        ".pane{min-height:0;display:flex;flex-direction:column;overflow:auto;border:1px solid var(--line);background:var(--bg)}",
        ".pane-functions{flex:0 0 50%;min-width:18rem}",
        ".pane-strings{flex:1 1 auto;min-width:18rem}",
        ".pane-header{position:sticky;top:0;z-index:1;background:var(--bg);border-bottom:1px solid var(--line);padding:8px 10px}",
        ".pane-body{display:flex;flex-direction:column;gap:8px;padding:8px 10px}",
        ".splitter{position:relative;flex:0 0 12px;cursor:col-resize;touch-action:none}",
        ".splitter::before{content:'';position:absolute;top:0;bottom:0;left:50%;width:1px;background:var(--line);transform:translateX(-50%)}",
        ".splitter::after{content:'';position:absolute;top:50%;left:50%;width:3px;height:40px;border-left:1px solid var(--line);border-right:1px solid var(--line);transform:translate(-50%,-50%)}",
        ".function-grid{display:flex;flex-wrap:wrap;gap:1px;align-content:flex-start}",
        ".function-box{width:var(--square);height:var(--square);border:1px solid var(--line);background:var(--fill)}",
        ".function-box.is-tag{border-color:var(--tag)}",
        ".function-box.is-string{background:var(--string)}",
        ".function-box.is-dim{opacity:.5}",
        ".string-list{display:flex;flex-direction:column;gap:2px}",
        ".string-row{display:flex;align-items:flex-start;gap:8px;width:100%;border:1px solid transparent;background:transparent;color:inherit;padding:3px 4px;font:inherit;text-align:left;cursor:pointer}",
        ".string-row:hover,.string-row.is-active{border-color:var(--line)}",
        ".string-address{color:var(--muted);white-space:nowrap;flex:0 0 auto}",
        ".string-value{min-width:0;flex:1 1 auto;white-space:pre-wrap;word-break:break-word}",
        ".string-tags{margin-left:auto;flex:0 0 auto;padding-left:8px;color:var(--muted);white-space:nowrap}",
        ".tooltip{position:fixed;z-index:10;display:none;width:min(42rem,calc(100vw - 24px));max-height:calc(100vh - 24px);overflow:auto;border:1px solid var(--line);background:#fff;padding:8px;pointer-events:none;white-space:pre-wrap;box-shadow:0 2px 8px rgba(0,0,0,.08)}",
        ".tooltip.is-visible{display:block}",
        "h1,h2{font-size:inherit}",
        "</style>",
        "</head>",
        "<body>",
        "<main>",
        "<section>",
        f"<div>{escape(report.meta.name)}</div>",
        (
            f'<div class="meta">sha256 {escape(report.meta.sha256)} · arch {escape(report.meta.arch)}'
            f" · functions {len(report.functions)} · strings {len(program_strings)} · tags {len(tag_entries)}</div>"
        ),
        "<h1>tags</h1>",
        '<div class="controls" id="tag-controls">',
    ]

    # one filter button per tag, showing how many functions it covers
    for tag, function_indices in tag_entries:
        parts.append(
            (
                f'<button type="button" class="control tag-control" data-tag="{escape(tag, quote=True)}" '
                f'data-count="{len(function_indices)}">{escape(tag)} '
                f'<span class="control-count">({len(function_indices)})</span></button>'
            )
        )

    parts.extend(
        [
            "</div>",
            '<div class="legend">border = tag · fill = string · dim = matches neither</div>',
            "</section>",
            '<div class="split-view" id="split-view">',
            '<section class="pane pane-functions" id="functions-pane">',
            f'<div class="pane-header">functions ({len(report.functions)})</div>',
            '<div class="pane-body">',
            '<div class="function-grid" id="function-grid">',
        ]
    )

    # one square per function, in report order (matches data.functions indices)
    for index, function in enumerate(report.functions):
        parts.append(
            (
                f'<div class="function-box" data-function-index="{index}" '
                f'data-function-address="{escape(hex(function.address), quote=True)}" '
                f'aria-label="{escape(function.name, quote=True)}"></div>'
            )
        )

    parts.extend(
        [
            "</div>",
            "</div>",
            "</section>",
            '<div class="splitter" id="splitter" role="separator" aria-orientation="vertical" aria-label="resize panes"></div>',
            '<section class="pane pane-strings" id="strings-pane">',
            f'<div class="pane-header">strings ({len(program_strings)})</div>',
            '<div class="pane-body">',
            '<div class="string-list" id="string-list">',
        ]
    )

    for index, program_string in enumerate(program_strings):
        parts.append(_render_string_row(program_string, index))

    # inline script: tag/string hover + click-to-lock highlighting, tooltip
    # placement, and a draggable splitter between the panes
    parts.extend(
        [
            "</div>",
            "</div>",
            "</section>",
            "</div>",
            "</main>",
            '<div class="tooltip" id="tooltip"></div>',
            f'<script type="application/json" id="mapa-data">{_to_json(data)}</script>',
            "<script>",
            "const data=JSON.parse(document.getElementById('mapa-data').textContent);",
            "const splitView=document.getElementById('split-view');",
            "const functionsPane=document.getElementById('functions-pane');",
            "const splitter=document.getElementById('splitter');",
            "const functionBoxes=[...document.querySelectorAll('.function-box')];",
            "const tagControls=[...document.querySelectorAll('.tag-control')];",
            "const stringRows=[...document.querySelectorAll('.string-row')];",
            "const tooltip=document.getElementById('tooltip');",
            "let hoveredTag=null;",
            "let lockedTag=null;",
            "let hoveredString=null;",
            "let lockedString=null;",
            "let activePointerId=null;",
            "const getActiveTag=()=>lockedTag??hoveredTag;",
            "const getActiveString=()=>lockedString??hoveredString;",
            "const updateView=()=>{",
            " const activeTag=getActiveTag();",
            " const activeString=getActiveString();",
            " const tagMatches=new Set(activeTag?data.tags[activeTag]||[]:[]);",
            " const stringMatches=new Set(activeString===null?[]:data.strings[activeString].functionIndices);",
            " const hasActive=activeTag!==null||activeString!==null;",
            " functionBoxes.forEach((box,index)=>{",
            " const isTag=tagMatches.has(index);",
            " const isString=stringMatches.has(index);",
            " box.classList.toggle('is-tag',isTag);",
            " box.classList.toggle('is-string',isString);",
            " box.classList.toggle('is-dim',hasActive && !(isTag || isString));",
            " });",
            " tagControls.forEach((control)=>{",
            " control.classList.toggle('is-active',control.dataset.tag===activeTag);",
            " });",
            " stringRows.forEach((row)=>{",
            " row.classList.toggle('is-active',Number(row.dataset.stringIndex)===activeString);",
            " });",
            "};",
            "const placeTooltip=(event)=>{",
            " const offset=12;",
            " let left=event.clientX+offset;",
            " let top=event.clientY+offset;",
            " const rect=tooltip.getBoundingClientRect();",
            " if(left+rect.width>window.innerWidth-8){left=Math.max(8,window.innerWidth-rect.width-8);}",
            " if(top+rect.height>window.innerHeight-8){top=Math.max(8,window.innerHeight-rect.height-8);}",
            " tooltip.style.left=`${left}px`;",
            " tooltip.style.top=`${top}px`;",
            "};",
            "const getPaneMinWidth=()=>parseFloat(getComputedStyle(document.documentElement).fontSize)*18;",
            "const resizePanes=(clientX)=>{",
            " const rect=splitView.getBoundingClientRect();",
            " const splitterWidth=splitter.getBoundingClientRect().width;",
            " const paneMinWidth=getPaneMinWidth();",
            " const minLeft=rect.left+paneMinWidth;",
            " const maxLeft=rect.right-paneMinWidth-splitterWidth;",
            " if(maxLeft<=minLeft){functionsPane.style.flexBasis='50%';return;}",
            " const clampedLeft=Math.min(maxLeft,Math.max(minLeft,clientX));",
            " functionsPane.style.flexBasis=`${clampedLeft-rect.left}px`;",
            "};",
            "const clampPaneSize=()=>{",
            " const basis=parseFloat(functionsPane.style.flexBasis);",
            " if(Number.isFinite(basis)){resizePanes(splitView.getBoundingClientRect().left+basis);}",
            "};",
            "const stopResizing=(event)=>{",
            " if(activePointerId===null||event.pointerId!==activePointerId){return;}",
            " if(splitter.hasPointerCapture(event.pointerId)){splitter.releasePointerCapture(event.pointerId);}",
            " activePointerId=null;",
            " document.body.classList.remove('is-resizing');",
            "};",
            "functionBoxes.forEach((box,index)=>{",
            " box.addEventListener('mouseenter',(event)=>{",
            " tooltip.textContent=data.functions[index];",
            " tooltip.classList.add('is-visible');",
            " placeTooltip(event);",
            " });",
            " box.addEventListener('mousemove',placeTooltip);",
            " box.addEventListener('mouseleave',()=>{tooltip.classList.remove('is-visible');});",
            "});",
            "tagControls.forEach((control)=>{",
            " control.addEventListener('mouseenter',()=>{if(lockedTag===null){hoveredTag=control.dataset.tag;updateView();}});",
            " control.addEventListener('mouseleave',()=>{if(lockedTag===null){hoveredTag=null;updateView();}});",
            " control.addEventListener('click',()=>{lockedTag=lockedTag===control.dataset.tag?null:control.dataset.tag;hoveredTag=null;updateView();});",
            "});",
            "stringRows.forEach((row)=>{",
            " row.addEventListener('mouseenter',()=>{if(lockedString===null){hoveredString=Number(row.dataset.stringIndex);updateView();}});",
            " row.addEventListener('mouseleave',()=>{if(lockedString===null){hoveredString=null;updateView();}});",
            " row.addEventListener('click',()=>{const index=Number(row.dataset.stringIndex);lockedString=lockedString===index?null:index;hoveredString=null;updateView();});",
            "});",
            "splitter.addEventListener('pointerdown',(event)=>{",
            " activePointerId=event.pointerId;",
            " splitter.setPointerCapture(event.pointerId);",
            " document.body.classList.add('is-resizing');",
            " resizePanes(event.clientX);",
            " event.preventDefault();",
            "});",
            "splitter.addEventListener('pointermove',(event)=>{if(activePointerId===event.pointerId){resizePanes(event.clientX);}});",
            "splitter.addEventListener('pointerup',stopResizing);",
            "splitter.addEventListener('pointercancel',stopResizing);",
            "window.addEventListener('resize',clampPaneSize);",
            "updateView();",
            "</script>",
            "</body>",
            "</html>",
        ]
    )

    return "\n".join(parts)
|
||||
165
mapa/ida_db.py
165
mapa/ida_db.py
@@ -1,165 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import fcntl
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
import idapro # must be first: mutates sys.path so ida_auto and ida_domain are importable
|
||||
import ida_auto
|
||||
from ida_domain.database import Database, IdaCommandOptions
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# seconds to wait for another process to release an already-analyzed database
DATABASE_ACCESS_TIMEOUT = 5.0
# seconds allowed for a fresh IDA auto-analysis run (includes lock wait)
DATABASE_ANALYSIS_TIMEOUT = 120.0
# polling cadence for lock and .nam sidecar checks
DATABASE_POLL_INTERVAL = 0.25
|
||||
|
||||
|
||||
def get_cache_dir() -> Path:
    """Return mapa's cache directory, honoring $XDG_CACHE_HOME when set.

    Falls back to ``~/.cache`` when the variable is unset or empty.
    """
    xdg_home = os.environ.get("XDG_CACHE_HOME")
    base = Path(xdg_home) if xdg_home else Path.home() / ".cache"
    return base / "mandiant" / "mapa"
|
||||
|
||||
|
||||
def compute_file_hashes(file_path: Path) -> tuple[str, str]:
    """Compute (md5, sha256) hex digests for a file, reading it in chunks.

    Raises:
        OSError: If the file cannot be read.
    """
    md5_digest = hashlib.md5()
    sha256_digest = hashlib.sha256()
    with file_path.open("rb") as fh:
        while True:
            chunk = fh.read(65536)
            if not chunk:
                break
            md5_digest.update(chunk)
            sha256_digest.update(chunk)
    return md5_digest.hexdigest(), sha256_digest.hexdigest()
|
||||
|
||||
|
||||
def _wait_for_repack(db_path: Path, timeout: float) -> None:
|
||||
nam_path = db_path.with_suffix(".nam")
|
||||
deadline = time.monotonic() + timeout
|
||||
while nam_path.exists():
|
||||
if time.monotonic() >= deadline:
|
||||
raise RuntimeError(
|
||||
f"Database {db_path} appears to be open in another program "
|
||||
f"({nam_path} still exists after {timeout:.0f}s)."
|
||||
)
|
||||
time.sleep(DATABASE_POLL_INTERVAL)
|
||||
|
||||
|
||||
@contextlib.contextmanager
def database_access_guard(db_path: Path, timeout: float) -> Iterator[None]:
    """Advisory guard that serialises access to an IDA database.

    Uses .nam polling + flock on <db>.lock with TOCTOU re-check.

    The overall budget is a single deadline: the initial .nam wait, the lock
    acquisition loop, and the post-lock re-check all share *timeout* seconds.

    Raises:
        RuntimeError: On timeout waiting for the database.
    """
    # first wait for any in-progress repack before contending for the lock
    _wait_for_repack(db_path, timeout)

    lock_path = Path(str(db_path) + ".lock")
    lock_fd = lock_path.open("w")
    deadline = time.monotonic() + timeout
    try:
        while True:
            try:
                # non-blocking exclusive lock; poll so we can enforce the deadline
                fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except OSError:
                if time.monotonic() >= deadline:
                    raise RuntimeError(
                        f"Timed out waiting for lock on {db_path} after {timeout:.0f}s."
                    )
                time.sleep(DATABASE_POLL_INTERVAL)

        # TOCTOU re-check: a repack may have started between the initial
        # .nam check and acquiring the lock; spend only the remaining budget
        _wait_for_repack(db_path, max(0, deadline - time.monotonic()))
        yield
    finally:
        # always release and close, even if the body raised
        fcntl.flock(lock_fd, fcntl.LOCK_UN)
        lock_fd.close()
|
||||
|
||||
|
||||
def resolve_database(file_path: Path) -> tuple[Path, str, str]:
    """Resolve an input path to an .i64/.idb database path.

    Returns (db_path, md5, sha256). For existing databases, hashes are empty
    strings (they'll be read from IDA metadata instead).

    For raw binaries, the database is cached under the cache directory keyed
    by the input's sha256; a fresh IDA auto-analysis run is performed on a
    cache miss.

    Raises:
        RuntimeError: If analysis or caching fails.
    """
    suffix = file_path.suffix.lower()
    if suffix in {".i64", ".idb"}:
        logger.debug("Using existing database: %s", file_path)
        return file_path, "", ""

    cache_dir = get_cache_dir()
    cache_dir.mkdir(parents=True, exist_ok=True)

    # cache key is the input file's sha256
    md5, sha256 = compute_file_hashes(file_path)
    cache_path = cache_dir / f"{sha256}.i64"

    if cache_path.exists():
        logger.debug("Cache hit for %s -> %s", file_path, cache_path)
        return cache_path, md5, sha256

    logger.debug("Cache miss for %s; analyzing to %s", file_path, cache_path)
    with database_access_guard(cache_path, timeout=DATABASE_ANALYSIS_TIMEOUT):
        # another process may have populated the cache while we waited
        if cache_path.exists():
            logger.debug("Cache populated while waiting for lock: %s", cache_path)
            return cache_path, md5, sha256

        logger.info("Analyzing %s (this may take a moment)...", file_path.name)
        idapro.enable_console_messages(False)
        ida_options = IdaCommandOptions(
            auto_analysis=True,
            new_database=True,
            output_database=str(cache_path),
            load_resources=True,
            # NOTE(review): pointing lumina at 0.0.0.0 looks intended to
            # neutralize Lumina network lookups during analysis — confirm
            plugin_options="lumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0",
        )
        try:
            with Database.open(str(file_path), ida_options, save_on_close=True):
                # block until IDA's auto-analysis queue drains
                ida_auto.auto_wait()
        except Exception as exc:
            raise RuntimeError(f"Analysis failed for {file_path}: {exc}") from exc

        if not cache_path.exists():
            raise RuntimeError(f"Analysis produced no database for {file_path}")

        logger.debug("Analysis completed: %s", cache_path)
        return cache_path, md5, sha256
|
||||
|
||||
|
||||
@contextlib.contextmanager
def open_database_session(db_path: Path, auto_analysis: bool = False) -> Iterator[Database]:
    """Open a database session with advisory locking.

    The advisory guard is held for the entire session, so concurrent mapa
    processes serialize on the same database.

    Raises:
        RuntimeError: If opening fails or the database is locked.
    """
    with database_access_guard(db_path, timeout=DATABASE_ACCESS_TIMEOUT):
        ida_options = IdaCommandOptions(auto_analysis=auto_analysis, new_database=False)
        logger.debug("Opening database session: %s (auto_analysis=%s)", db_path, auto_analysis)
        # keep IDA's console quiet for library-style use
        idapro.enable_console_messages(False)
        try:
            database = Database.open(str(db_path), ida_options, save_on_close=False)
        except Exception as exc:
            raise RuntimeError(f"Failed to open {db_path}: {exc}") from exc

        # the Database context manager closes the session on exit
        with database:
            if auto_analysis:
                # drain any pending auto-analysis before handing out the db
                ida_auto.auto_wait()
            yield database

        logger.debug("Closed database session: %s", db_path)
|
||||
126
mapa/model.py
126
mapa/model.py
@@ -1,126 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AssemblageRecord:
    """One Assemblage ground-truth row, rebased into this binary's address space."""

    sha256: str
    name: str
    start_rva: int
    end_rva: int
    address: int
    end_address: int
    source_file: str

    @property
    def source_path(self) -> str:
        """Source file path with any trailing " (...)" annotation stripped."""
        if not self.source_file.endswith(")"):
            return self.source_file
        head, separator, _tail = self.source_file.rpartition(" (")
        return head if separator else self.source_file

    @classmethod
    def from_csv_row(
        cls, row: Mapping[str, str], base_address: int
    ) -> AssemblageRecord:
        """Build a record from a CSV row, adding *base_address* to the RVAs.

        RVAs are parsed with base auto-detection, so "0x10" and "16" both work.
        """
        rva_start = int(row["start"], 0)
        rva_end = int(row["end"], 0)
        return cls(
            sha256=row["hash"].strip().lower(),
            name=row["name"].strip(),
            start_rva=rva_start,
            end_rva=rva_end,
            address=base_address + rva_start,
            end_address=base_address + rva_end,
            source_file=row["source_file"].strip(),
        )
|
||||
|
||||
|
||||
@dataclass
class MapaString:
    """A string observed within a single function, with classifier tags."""

    value: str  # display text (collector stores it with trailing whitespace stripped)
    address: int  # lowest address at which this text was found
    tags: tuple[str, ...] = ()  # de-duplicated, sorted tag names
    tag_matches: tuple = ()  # tagger match objects; presumably expose .sort_key — TODO confirm type
|
||||
|
||||
|
||||
@dataclass
class MapaProgramString:
    """A program-wide string: one entry per address, merged across functions."""

    value: str
    address: int
    tags: tuple[str, ...] = ()  # de-duplicated, sorted tag names
    tag_matches: tuple = ()  # tagger match objects; presumably expose .sort_key — TODO confirm type
    function_addresses: tuple[int, ...] = ()  # sorted start addresses of referencing functions
|
||||
|
||||
|
||||
@dataclass
class MapaCall:
    """An outgoing call edge from a function to an API or another function."""

    name: str
    address: int
    is_api: bool  # True when the target resolves to an import/extern
    delta: int = 0  # target order minus caller order, in address-sorted function order
    direction: str = ""  # "↑" when delta < 0 (earlier target), otherwise "↓"
|
||||
|
||||
|
||||
@dataclass
class MapaCaller:
    """An incoming call edge: a function that calls the annotated function."""

    name: str
    address: int
    delta: int = 0  # caller order minus callee order, in address-sorted function order
    direction: str = ""  # "↑" when delta < 0 (earlier caller), otherwise "↓"
|
||||
|
||||
|
||||
@dataclass
class MapaFunction:
    """Everything mapa reports about a single function."""

    address: int
    name: str
    is_thunk: bool = False
    is_library: bool = False
    num_basic_blocks: int = 0
    num_edges: int = 0  # total successor edges across all basic blocks
    num_instructions: int = 0
    total_instruction_bytes: int = 0
    callers: list[MapaCaller] = field(default_factory=list)
    calls: list[MapaCall] = field(default_factory=list)  # non-API callees
    apis: list[MapaCall] = field(default_factory=list)  # import/extern callees
    strings: list[MapaString] = field(default_factory=list)
    capa_matches: list[str] = field(default_factory=list)  # sorted capa match names
    assemblage_records: list[AssemblageRecord] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class MapaSection:
    """A mapped program section."""

    # section start address
    address: int
    # section size in bytes
    size: int
    # permission string (rendered between address and size)
    perms: str
    # section name, if known
    name: str = ""
|
||||
|
||||
|
||||
@dataclass
class MapaLibrary:
    """A library the program links against."""

    # library name
    name: str
    # statically linked (vs. dynamically loaded)
    is_static: bool = False
    # load address, when known; None otherwise
    load_address: int | None = None
|
||||
|
||||
|
||||
@dataclass
class MapaMeta:
    """File-level metadata for a report."""

    # file name
    name: str
    # SHA-256 of the file
    sha256: str
    # MD5 of the file, if computed
    md5: str = ""
    # architecture identifier
    arch: str = ""
    # analysis timestamp (rendered as the "ts:" line)
    timestamp: str = ""
    # image base address
    base_address: int = 0
|
||||
|
||||
|
||||
@dataclass
class MapaReport:
    """Top-level analysis report: file metadata plus per-category results.

    NOTE(review): this definition may continue beyond the visible excerpt;
    only the fields shown here are documented.
    """

    # file-level metadata (name, hashes, arch, ...)
    meta: MapaMeta
    # mapped program sections
    sections: list[MapaSection] = field(default_factory=list)
    # linked/loaded libraries
    libraries: list[MapaLibrary] = field(default_factory=list)
    # analyzed functions, in the order they are rendered
    functions: list[MapaFunction] = field(default_factory=list)
    # strings attributed to the program as a whole
    program_strings: list[MapaProgramString] = field(default_factory=list)
|
||||
273
mapa/renderer.py
273
mapa/renderer.py
@@ -1,273 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
|
||||
import rich.padding
|
||||
from rich.console import Console
|
||||
from rich.markup import escape
|
||||
from rich.text import Text
|
||||
|
||||
from mapa.model import MapaFunction, MapaReport, MapaString
|
||||
|
||||
|
||||
class Renderer:
    """Indentation-aware wrapper around a rich Console.

    Tracks a logical indent level and left-pads everything it prints by
    two spaces per level.
    """

    def __init__(self, console: Console):
        self.console: Console = console
        self.indent: int = 0

    @contextlib.contextmanager
    def indenting(self):
        """Deepen the indent by one level for the duration of the block."""
        self.indent += 1
        try:
            yield
        finally:
            self.indent -= 1

    @staticmethod
    def markup(s: str, **kwargs) -> Text:
        """Format *s* with *kwargs* (escaping string values), then parse markup."""
        safe = {}
        for key, value in kwargs.items():
            safe[key] = escape(value) if isinstance(value, str) else value
        return Text.from_markup(s.format(**safe))

    def print(self, renderable, **kwargs):
        """Print *renderable* at the current indent.

        When keyword arguments are given, *renderable* must be a string
        template that is first expanded via `markup()`.
        """
        if kwargs:
            assert isinstance(renderable, str)
            return self.print(self.markup(renderable, **kwargs))
        padded = rich.padding.Padding(renderable, (0, 0, 0, self.indent * 2))
        return self.console.print(padded)

    def writeln(self, s: str):
        """Print one plain line at the current indent."""
        self.print(s)

    @contextlib.contextmanager
    def section(self, name):
        """Print a 'title'-styled heading, then indent everything within."""
        if isinstance(name, Text):
            heading = name.copy()
            heading.stylize_before(self.console.get_style("title"))
            self.print(heading)
        elif isinstance(name, str):
            self.print("[title]{name}", name=name)
        else:
            raise ValueError("unexpected section name")
        with self.indenting():
            yield
|
||||
|
||||
|
||||
def _visible_tags(tags: tuple[str, ...]) -> list[str]:
|
||||
tag_set = set(tags)
|
||||
has_specific = any(t != "#common" for t in tag_set)
|
||||
result = []
|
||||
for t in tags:
|
||||
if t == "#common" and has_specific:
|
||||
continue
|
||||
result.append(t)
|
||||
return result
|
||||
|
||||
|
||||
def _render_string_line(o: Renderer, value: str, tags: list[str]) -> Text:
    """Render a string row with its tags right-aligned at the console edge.

    When the console is too narrow to right-align sensibly (fewer than 12
    cells left for the string), the tags are simply appended after one space.
    The string itself is ellipsis-truncated if it would collide with the tags.
    """
    quoted = Text.from_markup(
        'string: [decoration]"[/]{string}[decoration]"[/]'.format(string=escape(value))
    )
    tag_text = Text(" ".join(tags), style="dim")

    width = o.console.size.width - (o.indent * 2)
    tag_width = tag_text.cell_len
    # widest the left part may be while keeping one space before the tags
    left_budget = width - tag_width - 1

    if left_budget < 12:
        # too narrow to right-align: join with a single space instead
        narrow = quoted.copy()
        narrow.append(" ")
        narrow.append(tag_text)
        return narrow

    if quoted.cell_len > left_budget:
        quoted.truncate(left_budget - 1, overflow="ellipsis")

    gap = width - quoted.cell_len - tag_width
    line = quoted.copy()
    line.append(" " * gap)
    line.append(tag_text)
    return line
|
||||
|
||||
|
||||
def _get_primary_source_path(func) -> str | None:
|
||||
if not func.assemblage_records:
|
||||
return None
|
||||
source_path = func.assemblage_records[0].source_path
|
||||
if not source_path:
|
||||
return None
|
||||
return source_path
|
||||
|
||||
|
||||
def _render_source_path_separator(o: Renderer, source_path: str) -> Text:
    """Render a horizontal rule with the source path centered in it.

    Looks like `---- [ path ] ----`; when the console is too narrow for any
    rule, just the bracketed label is returned.
    """
    label = f"[ {source_path} ]"
    width = max(0, o.console.size.width - (o.indent * 2))
    if width <= len(label) + 2:
        # no room for dashes: show only the label
        return Text(label, style="decoration")

    dash_total = width - len(label) - 2
    left_dashes = dash_total // 2
    right_dashes = dash_total - left_dashes

    out = Text("-" * left_dashes, style="decoration")
    out.append(" ")
    out.append(label, style="decoration")
    out.append(" ")
    out.append("-" * right_dashes, style="decoration")
    return out
|
||||
|
||||
|
||||
def _format_function_heading(func: MapaFunction) -> str:
|
||||
kind = "thunk" if func.is_thunk else "function"
|
||||
return f"{kind} {func.name} @ {hex(func.address)}"
|
||||
|
||||
|
||||
def _iter_function_rows(func: MapaFunction):
|
||||
for record in func.assemblage_records:
|
||||
yield "assemblage", record
|
||||
|
||||
if func.is_thunk:
|
||||
return
|
||||
|
||||
for caller in func.callers:
|
||||
yield "caller", caller
|
||||
|
||||
yield "metrics", None
|
||||
|
||||
for match in func.capa_matches:
|
||||
yield "capa", match
|
||||
|
||||
for call in func.calls:
|
||||
yield "call", call
|
||||
|
||||
for api in func.apis:
|
||||
yield "api", api
|
||||
|
||||
for string in func.strings:
|
||||
yield "string", string
|
||||
|
||||
|
||||
def _render_plain_string_line(string: MapaString) -> str:
    """Format a MapaString as one plain-text line, appending visible tags."""
    shown = _visible_tags(string.tags)
    if not shown:
        return f'string: "{string.value}"'
    return f'string: "{string.value}" ' + " ".join(shown)
|
||||
|
||||
|
||||
def render_function_summary_text(func: MapaFunction) -> str:
    """Render *func* as a plain-text, multi-line summary (no rich markup).

    Row order and content mirror `_iter_function_rows`.
    """
    out: list[str] = [_format_function_heading(func)]
    emit = out.append
    for kind, value in _iter_function_rows(func):
        if kind == "assemblage":
            emit(f"assemblage name: {value.name}")
            emit(f"assemblage file: {value.source_path}")
        elif kind == "caller":
            emit(f"xref: {value.direction} {value.name} ({value.delta:+})")
        elif kind == "metrics":
            emit(
                f"B/E/I: {func.num_basic_blocks} / {func.num_edges} / {func.num_instructions}"
                f" ({func.total_instruction_bytes} bytes)"
            )
        elif kind == "capa":
            emit(f"capa: {value}")
        elif kind == "call":
            emit(f"calls: {value.direction} {value.name} ({value.delta:+})")
        elif kind == "api":
            emit(f"api: {value.name}")
        elif kind == "string":
            emit(_render_plain_string_line(value))
        else:
            raise ValueError(f"unexpected function row kind: {kind}")
    return "\n".join(out)
|
||||
|
||||
|
||||
def render_report(report: MapaReport, console: Console) -> None:
    """Render the full report to *console* with rich markup.

    Sections are emitted in a fixed order: meta, sections, libraries, then
    one indented block per function. Row kinds come from
    `_iter_function_rows` and are rendered here with markup styling
    ("decoration", "title", "default" styles).
    """
    o = Renderer(console)

    with o.section("meta"):
        o.writeln(f"name: {report.meta.name}")
        o.writeln(f"sha256: {report.meta.sha256}")
        o.writeln(f"arch: {report.meta.arch}")
        o.writeln(f"ts: {report.meta.timestamp}")

    with o.section("sections"):
        for section in report.sections:
            o.writeln(f"- {hex(section.address)} {section.perms} {hex(section.size)}")

    with o.section("libraries"):
        for lib in report.libraries:
            static = " (static)" if lib.is_static else ""
            addr = (
                f" at {hex(lib.load_address)}" if lib.load_address is not None else ""
            )
            o.writeln(f"- {lib.name:<12s}{static}{addr}")
        if not report.libraries:
            o.writeln("(none)")

    with o.section("functions"):
        # Emit a "[ path ]" separator rule whenever the source file changes
        # between consecutive functions. The very first source file gets no
        # separator (last_source_path is still None then).
        last_source_path: str | None = None
        for func in report.functions:
            source_path = _get_primary_source_path(func)
            if source_path is not None:
                if last_source_path is not None and source_path != last_source_path:
                    o.print(_render_source_path_separator(o, source_path))
                last_source_path = source_path

            with o.section(
                o.markup(
                    "{function_kind} [default]{function_name}[/] [decoration]@ {function_address}[/]",
                    function_kind="thunk" if func.is_thunk else "function",
                    function_name=func.name,
                    function_address=hex(func.address),
                )
            ):
                for kind, value in _iter_function_rows(func):
                    if kind == "assemblage":
                        o.writeln(f"assemblage name: {value.name}")
                        o.writeln(f"assemblage file: {value.source_path}")
                    elif kind == "caller":
                        o.print(
                            "xref: [decoration]{direction}[/] {name} [decoration]({delta:+})[/]",
                            direction=value.direction,
                            name=value.name,
                            delta=value.delta,
                        )
                    elif kind == "metrics":
                        # basic blocks / edges / instructions summary line
                        o.writeln(
                            f"B/E/I: {func.num_basic_blocks} / {func.num_edges} / {func.num_instructions} ({func.total_instruction_bytes} bytes)"
                        )
                    elif kind == "capa":
                        o.writeln(f"capa: {value}")
                    elif kind == "call":
                        o.print(
                            "calls: [decoration]{direction}[/] {name} [decoration]({delta:+})[/]",
                            direction=value.direction,
                            name=value.name,
                            delta=value.delta,
                        )
                    elif kind == "api":
                        o.print(
                            "api: {name}",
                            name=value.name,
                        )
                    elif kind == "string":
                        # strings with visible tags get right-aligned tag
                        # rendering; untagged strings use the simple template
                        visible_tags = _visible_tags(value.tags)
                        if visible_tags:
                            o.print(_render_string_line(o, value.value, visible_tags))
                        else:
                            o.print(
                                'string: [decoration]"[/]{string}[decoration]"[/]',
                                string=value.value,
                            )
                    else:
                        raise ValueError(f"unexpected function row kind: {kind}")

            # blank line after each non-thunk function for readability
            if not func.is_thunk:
                o.print("")
|
||||
@@ -1,25 +0,0 @@
|
||||
# Vendored Quantum Strand string databases
|
||||
|
||||
Upstream: `../quantumstrand/` (https://github.com/mandiant/flare-floss, Quantum Strand branch)
|
||||
Upstream commit: `73eb1541e896c065fc694ba7b01067f56871631b`
|
||||
|
||||
## Copied data files
|
||||
|
||||
- `data/oss/*.jsonl.gz` — open-source library string databases
|
||||
- `data/crt/msvc_v143.jsonl.gz` — MSVC CRT string database
|
||||
- `data/expert/capa.jsonl` — expert tagging rules (plain JSONL, not gzipped)
|
||||
- `data/winapi/apis.txt.gz` — Windows API function names
|
||||
- `data/winapi/dlls.txt.gz` — Windows DLL names
|
||||
- `data/gp/gp.jsonl.gz` — global prevalence strings
|
||||
- `data/gp/cwindb-native.jsonl.gz` — CWinDB native prevalence strings
|
||||
- `data/gp/cwindb-dotnet.jsonl.gz` — CWinDB .NET prevalence strings
|
||||
- `data/gp/junk-code.jsonl.gz` — junk/compiler-generated code strings
|
||||
- `data/gp/xaa-hashes.bin` — truncated MD5 hash set (8 bytes per entry)
|
||||
- `data/gp/yaa-hashes.bin` — truncated MD5 hash set (8 bytes per entry)
|
||||
|
||||
## Code
|
||||
|
||||
The loader, tagger, and model code in this package are mapa-local rewrites
|
||||
inspired by upstream modules `floss/qs/db/oss.py`, `expert.py`, `gp.py`,
|
||||
`winapi.py`, and the tagging logic in `floss/qs/main.py`. No upstream code
|
||||
was copied verbatim.
|
||||
@@ -1 +0,0 @@
|
||||
from __future__ import annotations
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user