diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index fa9df37c..0c73e29f 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -106,13 +106,14 @@ class StaticFeatureExtractor: __metaclass__ = abc.ABCMeta - def __init__(self): + def __init__(self, hashes: SampleHashes): # # note: a subclass should define ctor parameters for its own use. # for example, the Vivisect feature extract might require the vw and/or path. # this base class doesn't know what to do with that info, though. # super().__init__() + self.sample_hashes = hashes @abc.abstractmethod def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: @@ -130,7 +131,7 @@ class StaticFeatureExtractor: """ fetch the hashes for the sample contained within the extractor. """ - raise NotImplementedError() + return self.sample_hashes @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: @@ -353,20 +354,21 @@ class DynamicFeatureExtractor: __metaclass__ = abc.ABCMeta - def __init__(self): + def __init__(self, hashes: SampleHashes): # # note: a subclass should define ctor parameters for its own use. # for example, the Vivisect feature extract might require the vw and/or path. # this base class doesn't know what to do with that info, though. # super().__init__() + self.sample_hashes = hashes @abc.abstractmethod def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the extractor. """ - raise NotImplementedError() + return self.sample_hashes @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 9f63aebb..ad021dee 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -29,20 +29,16 @@ from capa.features.extractors.base_extractor import ( class BinjaFeatureExtractor(StaticFeatureExtractor): def __init__(self, bv: binja.BinaryView): - super().__init__() + super().__init__(hashes=SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes())) self.bv = bv self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) - self.sample_hashes = SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index c3da7606..2a070c91 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -33,15 +33,14 @@ TESTED_VERSIONS = {"2.2-CAPE", "2.4-CAPE"} class CapeExtractor(DynamicFeatureExtractor): def __init__(self, report: CapeReport): - super().__init__() - self.report: CapeReport = report - - self.sample_hashes = SampleHashes( - md5=self.report.target.file.md5.lower(), - sha1=self.report.target.file.sha1.lower(), - sha256=self.report.target.file.sha256.lower(), + super().__init__( + hashes=SampleHashes( + md5=report.target.file.md5.lower(), + sha1=report.target.file.sha1.lower(), + sha256=report.target.file.sha256.lower(), + ) ) - + self.report: CapeReport = report self.global_features = capa.features.extractors.cape.global_.extract_features(self.report) def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]: @@ -49,9 +48,6 @@ class CapeExtractor(DynamicFeatureExtractor): assert self.report.static is not None and self.report.static.pe is not None return AbsoluteVirtualAddress(self.report.static.pe.imagebase) - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index 5d34b7cf..f1430fbd 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -76,9 +76,8 @@ class DnFileFeatureExtractorCache: class DnfileFeatureExtractor(StaticFeatureExtractor): def __init__(self, path: Path): - super().__init__() self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) - self.sample_hashes = SampleHashes.from_bytes(path.read_bytes()) + super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes())) # pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction # most relevant at instruction scope @@ -93,9 +92,6 @@ class DnfileFeatureExtractor(StaticFeatureExtractor): def get_base_address(self): return NO_ADDRESS - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/dnfile_.py b/capa/features/extractors/dnfile_.py index d18c325d..a6cd94c7 100644 --- a/capa/features/extractors/dnfile_.py +++ b/capa/features/extractors/dnfile_.py @@ -83,17 +83,13 @@ GLOBAL_HANDLERS = ( class DnfileFeatureExtractor(StaticFeatureExtractor): def __init__(self, path: Path): - super().__init__() + super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes())) self.path: Path = path self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) - self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self) -> AbsoluteVirtualAddress: return AbsoluteVirtualAddress(0x0) - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def get_entry_point(self) -> int: # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index 70789598..a1c7375f 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -167,17 +167,13 @@ GLOBAL_HANDLERS = ( class DotnetFileFeatureExtractor(StaticFeatureExtractor): def __init__(self, path: Path): - super().__init__() + super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes())) self.path: Path = path self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) - self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return NO_ADDRESS - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def get_entry_point(self) -> int: # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 23b72f41..e73db2ad 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -30,21 +30,19 @@ from capa.features.extractors.base_extractor import ( class IdaFeatureExtractor(StaticFeatureExtractor): def __init__(self): - super().__init__() + super().__init__( + hashes=SampleHashes( + md5=ida_nalt.retrieve_input_file_md5(), sha1="(unknown)", sha256=ida_nalt.retrieve_input_file_sha256() + ) + ) self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) - self.sample_hashes = SampleHashes( - md5=ida_nalt.retrieve_input_file_md5(), sha1="(unknown)", sha256=ida_nalt.retrieve_input_file_sha256() - ) def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/null.py b/capa/features/extractors/null.py index 6a731bee..48798ee1 100644 --- a/capa/features/extractors/null.py +++ b/capa/features/extractors/null.py @@ -16,7 +16,6 @@ from capa.features.extractors.base_extractor import ( BBHandle, CallHandle, InsnHandle, - SampleHashes, ThreadHandle, ProcessHandle, FunctionHandle, @@ -51,7 +50,6 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor): """ base_address: Address - sample_hashes: SampleHashes global_features: List[Feature] file_features: List[Tuple[Address, Feature]] functions: Dict[Address, FunctionFeatures] @@ -63,9 +61,6 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor): for feature in self.global_features: yield feature, NO_ADDRESS - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_file_features(self): for address, feature in self.file_features: yield feature, address @@ -115,7 +110,6 @@ class ProcessFeatures: @dataclass class NullDynamicFeatureExtractor(DynamicFeatureExtractor): base_address: Address - sample_hashes: SampleHashes global_features: List[Feature] file_features: List[Tuple[Address, Feature]] processes: Dict[Address, ProcessFeatures] @@ -124,9 +118,6 @@ class NullDynamicFeatureExtractor(DynamicFeatureExtractor): for feature in self.global_features: yield feature, NO_ADDRESS - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_file_features(self): for address, feature in self.file_features: yield feature, address diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index e7913440..55e0688e 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -187,17 +187,13 @@ GLOBAL_HANDLERS = ( class PefileFeatureExtractor(StaticFeatureExtractor): def __init__(self, path: Path): - super().__init__() + super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes())) self.path: Path = path self.pe = pefile.PE(str(path)) - self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase) - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_global_features(self): buf = Path(self.path).read_bytes() diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index a4f9c748..86b905c0 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -33,11 +33,10 @@ logger = logging.getLogger(__name__) class VivisectFeatureExtractor(StaticFeatureExtractor): def __init__(self, vw, path: Path, os): - super().__init__() self.vw = vw self.path = path self.buf = path.read_bytes() - self.sample_hashes = SampleHashes.from_bytes(self.buf) + super().__init__(hashes=SampleHashes.from_bytes(self.buf)) # pre-compute these because we'll yield them at *every* scope. self.global_features: List[Tuple[Feature, Address]] = [] @@ -49,9 +48,6 @@ class VivisectFeatureExtractor(StaticFeatureExtractor): # assume there is only one file loaded into the vw return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"]) - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_global_features(self): yield from self.global_features