diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 64475f65..841044ee 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,6 +1,6 @@
 name: CI
 
-on: 
+on:
   push:
     branches: [ master ]
   pull_request:
@@ -136,3 +136,55 @@ jobs:
       env:
         BN_LICENSE: ${{ secrets.BN_LICENSE }}
       run: pytest -v tests/test_binja_features.py # explicitly refer to the binja tests for performance. other tests run above.
+
+  ghidra-tests:
+    name: Ghidra tests for ${{ matrix.python-version }}
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.11"]
+        java-version: ["17"]
+    steps:
+    - name: Checkout capa with submodules
+      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      with:
+        submodules: recursive
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Set up Java ${{ matrix.java-version }}
+      uses: actions/setup-java@5ffc13f4174014e2d4d4572b3d74c3fa61aeb2c2 # v3
+      with:
+        distribution: 'temurin'
+        java-version: ${{ matrix.java-version }}
+    - name: Set up Gradle 7.3 # must be done manually due to no gradle build in capa
+      run: |
+        mkdir /opt/gradle
+        wget "https://services.gradle.org/distributions/gradle-7.3-bin.zip" -O /opt/gradle/gradle-7.3.zip
+        unzip /opt/gradle/gradle-7.3.zip -d /opt/gradle
+    - name: Install Ghidra 10.3
+      run: |
+        mkdir ./.github/ghidra
+        wget "https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_10.3_build/ghidra_10.3_PUBLIC_20230510.zip" -O ./.github/ghidra/ghidra_10.3_PUBLIC.zip
+        unzip .github/ghidra/ghidra_10.3_PUBLIC.zip -d .github/ghidra/
+    - name: Install Jep 4.1.1
+      run: |
+        mkdir ./.github/jep
+        wget "https://github.com/ninia/jep/archive/refs/tags/v4.1.1.zip" -O ./.github/jep/jep-4.1.1.zip
+        unzip .github/jep/jep-4.1.1.zip -d .github/jep/
+        pip install .github/jep/jep-4.1.1/
+    - name: Install Ghidrathon
+      run: |
+        mkdir ./.github/ghidrathon
+        wget "https://github.com/mandiant/Ghidrathon/archive/refs/tags/v2.1.0.zip" -O ./.github/ghidrathon/ghidrathon-2.1.0.zip
+        unzip .github/ghidrathon/ghidrathon-2.1.0.zip -d .github/ghidrathon/
+        workdir=$(pwd)
+        /opt/gradle/gradle-7.3/bin/gradle -p ./.github/ghidrathon/Ghidrathon-2.1.0/ -PGHIDRA_INSTALL_DIR=$workdir/.github/ghidra/ghidra_10.3_PUBLIC
+        unzip .github/ghidrathon/Ghidrathon-2.1.0/dist/*.zip -d $workdir/.github/ghidra/ghidra_10.3_PUBLIC/Extensions
+    - name: Install pyyaml
+      run: sudo apt-get install -y libyaml-dev
+    - name: Install capa
+      run: pip install -e .[dev]
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5fe6ae2b..27915d5e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### New Features
 - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04)
+- extractor: Implement Ghidra Backend [@colton-gabertan](https://github.com/colton-gabertan)
 
 ### Breaking Changes
 
@@ -16,7 +17,7 @@
 - communication/mailslot/read-from-mailslot nick.simonian@mandiant.com
 - nursery/hash-data-using-sha512managed-in-dotnet jonathanlepore@google.com
 - nursery/compiled-with-exescript jonathanlepore@google.com
--
+
 
 ### Bug Fixes
 - extractor: update vivisect Arch extraction #1334 @mr-tz
diff --git a/capa/features/extractors/ghidra/__init__.py b/capa/features/extractors/ghidra/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/capa/features/extractors/ghidra/extractor.py b/capa/features/extractors/ghidra/extractor.py
new file mode 100644
index 00000000..2f5a593a
--- /dev/null
+++ b/capa/features/extractors/ghidra/extractor.py
@@ -0,0 +1,18 @@
+import logging
+import contextlib
+from typing import List, Tuple, Iterator
+
+from capa.features.common import Feature
+from capa.features.address import Address, AbsoluteVirtualAddress
+from capa.features.extractors.base_extractor import FeatureExtractor
+import capa.features.extractors.ghidra.global_
+
+class GhidraFeatureExtractor(FeatureExtractor):
+    """Feature extractor that collects features through the Ghidra backend modules."""
+
+    def __init__(self):
+        super().__init__()
+        # global features (OS, arch) are collected once, at construction time
+        self.global_features: List[Tuple[Feature, Address]] = []
+        self.global_features.extend(capa.features.extractors.ghidra.global_.extract_os())
+        self.global_features.extend(capa.features.extractors.ghidra.global_.extract_arch())
diff --git a/capa/features/extractors/ghidra/file.py b/capa/features/extractors/ghidra/file.py
new file mode 100644
index 00000000..e69de29b
diff --git a/capa/features/extractors/ghidra/global_.py b/capa/features/extractors/ghidra/global_.py
new file mode 100644
index 00000000..b0fad5cf
--- /dev/null
+++ b/capa/features/extractors/ghidra/global_.py
@@ -0,0 +1,93 @@
+import logging
+import contextlib
+from io import BytesIO
+from typing import Tuple, Iterator
+
+# imports for clarity
+# note: currentProgram is a static variable accessible in
+# the specific ghidra runtime environment
+import ghidra.program.database.mem
+import ghidra.program.flatapi as flatapi
+ghidraapi = flatapi.FlatProgramAPI(currentProgram)  # Ghidrathon hacks :)
+
+import capa.features.extractors.elf
+from capa.features.common import OS, ARCH_I386, ARCH_AMD64, OS_WINDOWS, Arch, Feature
+from capa.features.address import NO_ADDRESS, Address
+
+logger = logging.getLogger(__name__)
+
+def extract_os() -> Iterator[Tuple[Feature, Address]]:
+    """Yield the OS feature for the program currently loaded in Ghidra.
+
+    PE files are reported as Windows; for ELF files the OS is detected by
+    `capa.features.extractors.elf.detect_elf_os` from the original file bytes.
+    Nothing is yielded for any other executable format.
+    """
+    current_program = ghidraapi.getCurrentProgram()
+    format_name: str = current_program.getExecutableFormat()
+
+    if "PE" in format_name:
+        yield OS(OS_WINDOWS), NO_ADDRESS
+
+    elif "ELF" in format_name:
+        program_memory = current_program.getMemory()  # ghidra.program.database.mem.MemoryMapDB
+        fbytes_list = program_memory.getAllFileBytes()  # java.util.List
+        fbytes = fbytes_list[0]  # ghidra.program.database.mem.FileBytes
+
+        # Java likes to return signed ints, so we must convert them
+        # back into unsigned bytes manually and write to BytesIO
+        # note: May be deprecated if Jep implements better support for Java Lists
+        #
+        # build the buffer in one pass: appending to an immutable bytes
+        # object once per byte would copy the whole buffer every iteration
+        pb_arr = bytes(fbytes.getOriginalByte(i) & 0xFF for i in range(fbytes.getSize()))
+
+        with contextlib.closing(BytesIO(pb_arr)) as f:
+            elf_os = capa.features.extractors.elf.detect_elf_os(f)
+
+        yield OS(elf_os), NO_ADDRESS
+
+    else:
+        # we likely end up here:
+        # 1. handling shellcode, or
+        # 2. handling a new file format (e.g. macho)
+        #
+        # for (1) we can't do much - its shellcode and all bets are off.
+        # we could maybe accept a further CLI argument to specify the OS,
+        # but i think this would be rarely used.
+        # rules that rely on OS conditions will fail to match on shellcode.
+        #
+        # for (2), this logic will need to be updated as the format is implemented.
+        logger.debug("unsupported file format: %s, will not guess OS", format_name)
+        return
+
+
+def extract_arch() -> Iterator[Tuple[Feature, Address]]:
+    """Yield the architecture feature for the program currently loaded in Ghidra.
+
+    The architecture is derived from the 'Language ID' program metadata entry;
+    only 32-bit and 64-bit x86 are recognized, nothing is yielded otherwise.
+    """
+    current_program = ghidraapi.getCurrentProgram()
+    lang_id = current_program.getMetadata().get('Language ID')
+
+    if 'x86' in lang_id and '64' in lang_id:
+        yield Arch(ARCH_AMD64), NO_ADDRESS
+
+    elif 'x86' in lang_id and '32' in lang_id:
+        yield Arch(ARCH_I386), NO_ADDRESS
+
+    elif 'x86' not in lang_id:
+        # we likely end up here:
+        # 1. handling a new architecture (e.g. aarch64)
+        #
+        # for (1), this logic will need to be updated as the format is implemented.
+        logger.debug("unsupported architecture: %s", lang_id)
+        return
+
+    else:
+        # x86, but the language ID names neither a 32-bit nor a 64-bit variant
+        logger.debug("unsupported architecture: non-32-bit nor non-64-bit intel")
+        return
+
+
diff --git a/capa/main.py b/capa/main.py
index b305673c..d4978d31 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -1317,6 +1317,34 @@ def ida_main():
     print(capa.render.default.render(meta, rules, capabilities))
 
 
+def ghidra_main():
+    """Entry point used when capa is executed as a script inside the Ghidra runtime."""
+    import capa.rules
+    #import capa.render.default
+    #import capa.features.extractors.ghidra.extractor
+    import capa.features.extractors.ghidra.global_
+
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger().setLevel(logging.INFO)
+
+    logger.debug("-" * 80)
+    logger.debug(" Using default embedded rules.")
+    logger.debug(" ")
+    logger.debug(" You can see the current default rule set here:")
+    logger.debug(" https://github.com/mandiant/capa-rules")
+    logger.debug("-" * 80)
+
+    rules_path = os.path.join(get_default_root(), "rules")
+    logger.debug("rule path: %s", rules_path)
+    rules = get_rules([rules_path])
+
+    # temp test for OS & ARCH extractions
+    globl_features: List[Tuple[Feature, Address]] = []
+    globl_features.extend(capa.features.extractors.ghidra.global_.extract_os())
+    globl_features.extend(capa.features.extractors.ghidra.global_.extract_arch())
+    print(globl_features)
+
+
 def is_runtime_ida():
     try:
         import idc
@@ -1326,8 +1354,21 @@ def is_runtime_ida():
         return True
 
 
+def is_runtime_ghidra():
+    """Return True if the Ghidra Python API can be imported, False otherwise."""
+    try:
+        import ghidra.program.flatapi
+    except ImportError:
+        return False
+    else:
+        return True
+
+
 if __name__ == "__main__":
     if is_runtime_ida():
         ida_main()
+    elif is_runtime_ghidra():
+        ghidra_main()
     else:
         sys.exit(main())
+
diff --git a/rules b/rules
index 188e6552..312d4cad 160000
--- a/rules
+++ b/rules
@@ -1 +1 @@
-Subproject commit 188e65528ec496eaaa792c3470cb4ab680a1b156
+Subproject commit 312d4cad891498e1d360dffcc98f669b63869c94
diff --git a/tests/fixtures.py b/tests/fixtures.py
index 04c9c53b..612f49a7 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -183,6 +183,20 @@ def get_binja_extractor(path):
     return extractor
 
 
+@lru_cache(maxsize=1)
+def get_ghidra_extractor(path):
+    import capa.features.extractors.ghidra.extractor
+
+    # GhidraFeatureExtractor.__init__ takes no arguments (the backend reads
+    # Ghidra's currentProgram), so the sample path must not be passed to it
+    extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor()
+
+    # overload the extractor so that the fixture exposes `extractor.path`
+    setattr(extractor, "path", path)
+
+    return extractor
+
+
 def extract_global_features(extractor):
     features = collections.defaultdict(set)
     for feature, va in extractor.extract_global_features():
diff --git a/tests/test_ghidra_features.py b/tests/test_ghidra_features.py
new file mode 100644
index 00000000..ff8e6485
--- /dev/null
+++ b/tests/test_ghidra_features.py
@@ -0,0 +1,55 @@
+import sys
+import logging
+import os.path
+import binascii
+import traceback
+
+import pytest
+
+try:
+    sys.path.append(os.path.dirname(__file__))
+    import fixtures
+    from fixtures import *
+finally:
+    sys.path.pop()
+
+
+logger = logging.getLogger("test_ghidra_features")
+
+
+# We need to skip the ghidra test if we cannot import ghidra modules, e.g., in GitHub CI.
+ghidra_present: bool = False
+try:
+    import ghidra.program.flatapi as flatapi
+    ghidraapi = flatapi.FlatProgramAPI(currentProgram)
+
+    try:
+        current_program_test = ghidraapi.getCurrentProgram()
+    except RuntimeError:
+        logger.warning("Ghidra runtime not detected")
+    else:
+        ghidra_present = True
+except ImportError:
+    pass
+
+
+@pytest.mark.skipif(ghidra_present is False, reason="Skip ghidra tests if the ghidra Python API is not installed")
+@fixtures.parametrize(
+    "sample,scope,feature,expected",
+    fixtures.FEATURE_PRESENCE_TESTS,
+    indirect=["sample", "scope"],
+)
+def test_ghidra_features(sample, scope, feature, expected):
+    fixtures.do_test_feature_presence(fixtures.get_ghidra_extractor, sample, scope, feature, expected)
+
+
+@pytest.mark.skipif(ghidra_present is False, reason="Skip ghidra tests if the ghidra Python API is not installed")
+@fixtures.parametrize(
+    "sample,scope,feature,expected",
+    fixtures.FEATURE_COUNT_TESTS,
+    indirect=["sample", "scope"],
+)
+def test_ghidra_feature_counts(sample, scope, feature, expected):
+    fixtures.do_test_feature_count(fixtures.get_ghidra_extractor, sample, scope, feature, expected)
+
+