diff --git a/CHANGELOG.md b/CHANGELOG.md index e1821d3f..45b27aa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ ### Breaking Changes - legacy term `arch` (i.e., "x32") is now called `bitness` @williballenthin +- freeze format gains new section for "global" features #759 @williballenthin + ### New Rules (78) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 67ba9451..c287e75b 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -59,6 +59,22 @@ class FeatureExtractor: """ raise NotImplemented + @abc.abstractmethod + def extract_global_features(self) -> Iterator[Tuple[Feature, int]]: + """ + extract features found at every scope ("global"). + + example:: + + extractor = VivisectFeatureExtractor(vw, path) + for feature, va in extractor.extract_global_features(): + print('0x%x: %s' % (va, feature)) + + yields: + Tuple[Feature, int]: feature and its location + """ + raise NotImplemented + @abc.abstractmethod def extract_file_features(self) -> Iterator[Tuple[Feature, int]]: """ @@ -216,6 +232,10 @@ class NullFeatureExtractor(FeatureExtractor): extractor = NullFeatureExtractor({ 'base address: 0x401000, + 'global features': [ + (0x0, capa.features.Arch('i386')), + (0x0, capa.features.OS('linux')), + ], 'file features': [ (0x402345, capa.features.Characteristic('embedded pe')), ], @@ -253,6 +273,11 @@ class NullFeatureExtractor(FeatureExtractor): def get_base_address(self): return self.features["base address"] + def extract_global_features(self): + for p in self.features.get("global features", []): + va, feature = p + yield feature, va + def extract_file_features(self): for p in self.features.get("file features", []): va, feature = p diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 7d00c7ef..e1b18b46 100644 --- a/capa/features/extractors/ida/extractor.py +++ 
b/capa/features/extractors/ida/extractor.py @@ -66,11 +66,12 @@ class IdaFeatureExtractor(FeatureExtractor): def get_base_address(self): return idaapi.get_imagebase() - def extract_file_features(self): - for (feature, ea) in capa.features.extractors.ida.file.extract_features(): - yield feature, ea + def extract_global_features(self): yield from self.global_features + def extract_file_features(self): + yield from capa.features.extractors.ida.file.extract_features() + def get_functions(self): import capa.features.extractors.ida.helpers as ida_helpers @@ -90,9 +91,7 @@ class IdaFeatureExtractor(FeatureExtractor): return FunctionHandle(f) def extract_function_features(self, f): - for (feature, ea) in capa.features.extractors.ida.function.extract_features(f): - yield feature, ea - yield from self.global_features + yield from capa.features.extractors.ida.function.extract_features(f) def get_basic_blocks(self, f): import capa.features.extractors.ida.helpers as ida_helpers @@ -101,9 +100,7 @@ class IdaFeatureExtractor(FeatureExtractor): yield BasicBlockHandle(bb) def extract_basic_block_features(self, f, bb): - for (feature, ea) in capa.features.extractors.ida.basicblock.extract_features(f, bb): - yield feature, ea - yield from self.global_features + yield from capa.features.extractors.ida.basicblock.extract_features(f, bb) def get_instructions(self, f, bb): import capa.features.extractors.ida.helpers as ida_helpers @@ -112,6 +109,4 @@ class IdaFeatureExtractor(FeatureExtractor): yield InstructionHandle(insn) def extract_insn_features(self, f, bb, insn): - for (feature, ea) in capa.features.extractors.ida.insn.extract_features(f, bb, insn): - yield feature, ea - yield from self.global_features + yield from capa.features.extractors.ida.insn.extract_features(f, bb, insn) diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index 3e5d97fd..8f6a3ed1 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ 
-148,8 +148,28 @@ FILE_HANDLERS = ( extract_file_section_names, extract_file_strings, extract_file_function_names, - extract_file_os, extract_file_format, +) + + +def extract_global_features(pe, buf): + """ + extract global features from the given PE file + + args: + pe (pefile.PE): the parsed PE + buf: the raw sample bytes + + yields: + Tuple[Feature, VA]: a feature and its location. + """ + for handler in GLOBAL_HANDLERS: + for feature, va in handler(pe=pe, buf=buf): + yield feature, va + + +GLOBAL_HANDLERS = ( + extract_file_os, extract_file_arch, ) @@ -163,12 +183,17 @@ class PefileFeatureExtractor(FeatureExtractor): def get_base_address(self): return self.pe.OPTIONAL_HEADER.ImageBase + def extract_global_features(self): + with open(self.path, "rb") as f: + buf = f.read() + + yield from extract_global_features(self.pe, buf) + def extract_file_features(self): with open(self.path, "rb") as f: buf = f.read() - for feature, va in extract_file_features(self.pe, buf): - yield feature, va + yield from extract_file_features(self.pe, buf) def get_functions(self): raise NotImplementedError("PefileFeatureExtract can only be used to extract file features") diff --git a/capa/features/extractors/smda/extractor.py b/capa/features/extractors/smda/extractor.py index 1a13653b..56788e43 100644 --- a/capa/features/extractors/smda/extractor.py +++ b/capa/features/extractors/smda/extractor.py @@ -25,34 +25,29 @@ class SmdaFeatureExtractor(FeatureExtractor): def get_base_address(self): return self.smda_report.base_addr - def extract_file_features(self): - for feature, va in capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf): - yield feature, va + def extract_global_features(self): yield from self.global_features + def extract_file_features(self): + yield from capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf) + def get_functions(self): for function in self.smda_report.getFunctions(): yield function def 
extract_function_features(self, f): - for feature, va in capa.features.extractors.smda.function.extract_features(f): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.smda.function.extract_features(f) def get_basic_blocks(self, f): for bb in f.getBlocks(): yield bb def extract_basic_block_features(self, f, bb): - for feature, va in capa.features.extractors.smda.basicblock.extract_features(f, bb): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.smda.basicblock.extract_features(f, bb) def get_instructions(self, f, bb): for smda_ins in bb.getInstructions(): yield smda_ins def extract_insn_features(self, f, bb, insn): - for feature, va in capa.features.extractors.smda.insn.extract_features(f, bb, insn): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.smda.insn.extract_features(f, bb, insn) diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index b18d4349..72fcee84 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -51,36 +51,31 @@ class VivisectFeatureExtractor(FeatureExtractor): # assume there is only one file loaded into the vw return list(self.vw.filemeta.values())[0]["imagebase"] - def extract_file_features(self): - for feature, va in capa.features.extractors.viv.file.extract_features(self.vw, self.buf): - yield feature, va + def extract_global_features(self): yield from self.global_features + def extract_file_features(self): + yield from capa.features.extractors.viv.file.extract_features(self.vw, self.buf) + def get_functions(self): for va in sorted(self.vw.getFunctions()): yield viv_utils.Function(self.vw, va) def extract_function_features(self, f): - for feature, va in capa.features.extractors.viv.function.extract_features(f): - yield feature, va - yield from self.global_features + yield from 
capa.features.extractors.viv.function.extract_features(f) def get_basic_blocks(self, f): return f.basic_blocks def extract_basic_block_features(self, f, bb): - for feature, va in capa.features.extractors.viv.basicblock.extract_features(f, bb): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.viv.basicblock.extract_features(f, bb) def get_instructions(self, f, bb): for insn in bb.instructions: yield InstructionHandle(insn) def extract_insn_features(self, f, bb, insn): - for feature, va in capa.features.extractors.viv.insn.extract_features(f, bb, insn): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.viv.insn.extract_features(f, bb, insn) def is_library_function(self, va): return viv_utils.flirt.is_library_function(self.vw, va) diff --git a/capa/features/freeze.py b/capa/features/freeze.py index 47fbcf3a..9c0d0c98 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -19,6 +19,10 @@ json format: ... }, 'scopes': { + 'global': [ + (str(name), [any(arg), ...], int(va), ()), + ... + ], 'file': [ (str(name), [any(arg), ...], int(va), ()), ... 
@@ -52,7 +56,6 @@ See the License for the specific language governing permissions and limitations import json import zlib import logging -import os.path import capa.features.file import capa.features.insn @@ -91,12 +94,15 @@ def dumps(extractor): "base address": extractor.get_base_address(), "functions": {}, "scopes": { + "global": [], "file": [], "function": [], "basic block": [], "instruction": [], }, } + for feature, va in extractor.extract_global_features(): + ret["scopes"]["global"].append(serialize_feature(feature) + (hex(va), ())) for feature, va in extractor.extract_file_features(): ret["scopes"]["file"].append(serialize_feature(feature) + (hex(va), ())) @@ -151,6 +157,7 @@ def loads(s): features = { "base address": doc.get("base address"), + "global features": [], "file features": [], "functions": {}, } @@ -180,6 +187,12 @@ def loads(s): # ('MatchedRule', ('foo', ), '0x401000', ('0x401000', )) # ^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^ ^^^^^^^^^^^^^^ # feature name args addr func/bb/insn + for feature in doc.get("scopes", {}).get("global", []): + va, loc = feature[2:] + va = int(va, 0x10) + feature = deserialize_feature(feature[:2]) + features["global features"].append((va, feature)) + for feature in doc.get("scopes", {}).get("file", []): va, loc = feature[2:] va = int(va, 0x10) diff --git a/capa/main.py b/capa/main.py index 2b91fe26..a7e1fb8a 100644 --- a/capa/main.py +++ b/capa/main.py @@ -79,7 +79,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: function_features = collections.defaultdict(set) # type: FeatureSet bb_matches = collections.defaultdict(list) # type: MatchResults - for feature, va in extractor.extract_function_features(f): + for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()): function_features[feature].add(va) for bb in extractor.get_basic_blocks(f): @@ -88,12 +88,16 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: # 
- basic blocks bb_features = collections.defaultdict(set) - for feature, va in extractor.extract_basic_block_features(f, bb): + for feature, va in itertools.chain( + extractor.extract_basic_block_features(f, bb), extractor.extract_global_features() + ): bb_features[feature].add(va) function_features[feature].add(va) for insn in extractor.get_instructions(f, bb): - for feature, va in extractor.extract_insn_features(f, bb, insn): + for feature, va in itertools.chain( + extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() + ): bb_features[feature].add(va) function_features[feature].add(va) @@ -112,7 +116,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet): file_features = collections.defaultdict(set) # type: FeatureSet - for feature, va in extractor.extract_file_features(): + for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()): # not all file features may have virtual addresses. # if not, then at least ensure the feature shows up in the index. # the set of addresses will still be empty. 
diff --git a/tests/fixtures.py b/tests/fixtures.py index cf1937da..47c23371 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -10,6 +10,7 @@ import os import os.path import binascii +import itertools import contextlib import collections from functools import lru_cache @@ -133,6 +134,13 @@ def get_pefile_extractor(path): return capa.features.extractors.pefile.PefileFeatureExtractor(path) +def extract_global_features(extractor): + features = collections.defaultdict(set) + for feature, va in extractor.extract_global_features(): + features[feature].add(va) + return features + + @lru_cache() def extract_file_features(extractor): features = collections.defaultdict(set) @@ -288,7 +296,10 @@ def resolve_scope(scope): if scope == "file": def inner_file(extractor): - return extract_file_features(extractor) + features = extract_file_features(extractor) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features inner_file.__name__ = scope return inner_file @@ -301,7 +312,10 @@ def resolve_scope(scope): def inner_bb(extractor): f = get_function(extractor, fva) bb = get_basic_block(extractor, f, bbva) - return extract_basic_block_features(extractor, f, bb) + features = extract_basic_block_features(extractor, f, bb) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features inner_bb.__name__ = scope return inner_bb @@ -311,7 +325,10 @@ def resolve_scope(scope): def inner_function(extractor): f = get_function(extractor, va) - return extract_function_features(extractor, f) + features = extract_function_features(extractor, f) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features inner_function.__name__ = scope return inner_function