From bf233c1c7af3a6dff544abfe389445db4b9d39d4 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 17 Oct 2023 10:56:35 +0000 Subject: [PATCH] integrate Ghidra backend with dynamic analysis --- capa/features/extractors/ghidra/extractor.py | 24 ++++++++-- capa/ghidra/helpers.py | 7 +-- capa/main.py | 8 ++-- scripts/show-features.py | 48 +------------------- 4 files changed, 31 insertions(+), 56 deletions(-) diff --git a/capa/features/extractors/ghidra/extractor.py b/capa/features/extractors/ghidra/extractor.py index d4439f0f..0c3db587 100644 --- a/capa/features/extractors/ghidra/extractor.py +++ b/capa/features/extractors/ghidra/extractor.py @@ -14,14 +14,32 @@ import capa.features.extractors.ghidra.function import capa.features.extractors.ghidra.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) -class GhidraFeatureExtractor(FeatureExtractor): +class GhidraFeatureExtractor(StaticFeatureExtractor): def __init__(self): - super().__init__() import capa.features.extractors.ghidra.helpers as ghidra_helpers + super().__init__( + SampleHashes( + md5=capa.ghidra.helpers.get_file_md5(), + # ghidra doesn't expose this hash. + # https://ghidra.re/ghidra_docs/api/ghidra/program/model/listing/Program.html + # + # the hashes are stored in the database, not computed on the fly, + # so its probably not trivial to add SHA1. + sha1="", + sha256=capa.ghidra.helpers.get_file_sha256(), + ) + ) + self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(capa.features.extractors.ghidra.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ghidra.global_.extract_os()) diff --git a/capa/ghidra/helpers.py b/capa/ghidra/helpers.py index b7debc16..b32c534a 100644 --- a/capa/ghidra/helpers.py +++ b/capa/ghidra/helpers.py @@ -143,17 +143,18 @@ def collect_metadata(rules: List[Path]): sha256=sha256, path=currentProgram().getExecutablePath(), # type: ignore [name-defined] # noqa: F821 ), - analysis=rdoc.Analysis( + flavor=rdoc.Flavor.STATIC, + analysis=rdoc.StaticAnalysis( format=currentProgram().getExecutableFormat(), # type: ignore [name-defined] # noqa: F821 arch=arch, os=os, extractor="ghidra", rules=tuple(r.resolve().absolute().as_posix() for r in rules), base_address=capa.features.freeze.Address.from_capa(currentProgram().getImageBase().getOffset()), # type: ignore [name-defined] # noqa: F821 - layout=rdoc.Layout( + layout=rdoc.StaticLayout( functions=(), ), - feature_counts=rdoc.FeatureCounts(file=0, functions=()), + feature_counts=rdoc.StaticFeatureCounts(file=0, functions=()), library_functions=(), ), ) diff --git a/capa/main.py b/capa/main.py index b2ed2dfd..36cc13c1 100644 --- a/capa/main.py +++ b/capa/main.py @@ -539,11 +539,13 @@ def find_dynamic_capabilities( return matches, meta -def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, **kwargs) -> Tuple[MatchResults, Any]: +def find_capabilities( + ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None, **kwargs +) -> Tuple[MatchResults, Any]: if isinstance(extractor, StaticFeatureExtractor): - return find_static_capabilities(ruleset, extractor, kwargs) + return find_static_capabilities(ruleset, extractor, disable_progress=disable_progress, **kwargs) elif isinstance(extractor, DynamicFeatureExtractor): - return find_dynamic_capabilities(ruleset, extractor, kwargs) + return find_dynamic_capabilities(ruleset, extractor, disable_progress=disable_progress, **kwargs) else: raise ValueError(f"unexpected extractor type: {extractor.__class__.__name__}") diff --git a/scripts/show-features.py b/scripts/show-features.py index 974880db..c8461cd4 100644 --- a/scripts/show-features.py +++ b/scripts/show-features.py @@ -315,52 +315,6 @@ def ida_main(): return 0 -def print_features(functions, extractor: capa.features.extractors.base_extractor.FeatureExtractor): - for f in functions: - if extractor.is_library_function(f.address): - function_name = extractor.get_function_name(f.address) - logger.debug("skipping library function %s (%s)", format_address(f.address), function_name) - continue - - print(f"func: {format_address(f.address)}") - - for feature, addr in extractor.extract_function_features(f): - if capa.features.common.is_global_feature(feature): - continue - - if f.address != addr: - print(f" func: {format_address(f.address)}: {feature} -> {format_address(addr)}") - else: - print(f" func: {format_address(f.address)}: {feature}") - - for bb in extractor.get_basic_blocks(f): - for feature, addr in extractor.extract_basic_block_features(f, bb): - if capa.features.common.is_global_feature(feature): - continue - - if bb.address != addr: - print(f" bb: {format_address(bb.address)}: {feature} -> {format_address(addr)}") - else: - print(f" bb: {format_address(bb.address)}: {feature}") - - for insn in extractor.get_instructions(f, bb): - for feature, addr in extractor.extract_insn_features(f, bb, insn): - if capa.features.common.is_global_feature(feature): - continue - - try: - if insn.address != addr: - print( - f" insn: {format_address(f.address)}: {format_address(insn.address)}: {feature} -> {format_address(addr)}" - ) - else: - print(f" insn: {format_address(insn.address)}: {feature}") - - except UnicodeEncodeError: - # may be an issue while piping to less and encountering non-ascii characters - continue - - def ghidra_main(): import capa.features.extractors.ghidra.extractor @@ -371,7 +325,7 @@ def ghidra_main(): function_handles = tuple(extractor.get_functions()) - print_features(function_handles, extractor) + print_static_features(function_handles, extractor) return 0