From af6fe6baa0a93e6d2c468a9f587d59a8ca1a147f Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 4 Sep 2021 15:53:05 -0600 Subject: [PATCH 1/7] extractors: extract global features as their own pseudo scope this means they can be extracted separately in the freeze format. closes #755 --- capa/features/extractors/base_extractor.py | 27 ++++++++++++++++++- capa/features/extractors/ida/extractor.py | 19 +++++-------- capa/features/extractors/pefile.py | 31 +++++++++++++++++++--- capa/features/extractors/smda/extractor.py | 19 +++++-------- capa/features/extractors/viv/extractor.py | 19 +++++-------- capa/features/freeze.py | 15 ++++++++++- capa/main.py | 8 +++--- 7 files changed, 93 insertions(+), 45 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 67ba9451..4b6be1ee 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -58,7 +58,23 @@ class FeatureExtractor: fetch the preferred load address at which the sample was analyzed. """ raise NotImplemented + + @abc.abstractmethod + def extract_global_features(self) -> Iterator[Tuple[Feature, int]]: + """ + extract features found at every scope ("global"). 
+ example:: + + extractor = VivisectFeatureExtractor(vw, path) + for feature, va in extractor.extract_global_features(): + print('0x%x: %s', va, feature) + + yields: + Tuple[Feature, int]: feature and its location + """ + raise NotImplemented + @abc.abstractmethod def extract_file_features(self) -> Iterator[Tuple[Feature, int]]: """ @@ -216,6 +232,10 @@ class NullFeatureExtractor(FeatureExtractor): extractor = NullFeatureExtractor({ 'base address: 0x401000, + 'global features': [ + (0x0, capa.features.Arch('i386')), + (0x0, capa.features.OS('linux')), + ], 'file features': [ (0x402345, capa.features.Characteristic('embedded pe')), ], @@ -252,7 +272,12 @@ class NullFeatureExtractor(FeatureExtractor): def get_base_address(self): return self.features["base address"] - + + def extract_global_features(self): + for p in self.features.get("global features", []): + va, feature = p + yield feature, va + def extract_file_features(self): for p in self.features.get("file features", []): va, feature = p diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 7d00c7ef..e1b18b46 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -66,11 +66,12 @@ class IdaFeatureExtractor(FeatureExtractor): def get_base_address(self): return idaapi.get_imagebase() - def extract_file_features(self): - for (feature, ea) in capa.features.extractors.ida.file.extract_features(): - yield feature, ea + def extract_global_features(self): yield from self.global_features + def extract_file_features(self): + yield from capa.features.extractors.ida.file.extract_features() + def get_functions(self): import capa.features.extractors.ida.helpers as ida_helpers @@ -90,9 +91,7 @@ class IdaFeatureExtractor(FeatureExtractor): return FunctionHandle(f) def extract_function_features(self, f): - for (feature, ea) in capa.features.extractors.ida.function.extract_features(f): - yield feature, ea - yield from 
self.global_features + yield from capa.features.extractors.ida.function.extract_features(f) def get_basic_blocks(self, f): import capa.features.extractors.ida.helpers as ida_helpers @@ -101,9 +100,7 @@ class IdaFeatureExtractor(FeatureExtractor): yield BasicBlockHandle(bb) def extract_basic_block_features(self, f, bb): - for (feature, ea) in capa.features.extractors.ida.basicblock.extract_features(f, bb): - yield feature, ea - yield from self.global_features + yield from capa.features.extractors.ida.basicblock.extract_features(f, bb) def get_instructions(self, f, bb): import capa.features.extractors.ida.helpers as ida_helpers @@ -112,6 +109,4 @@ class IdaFeatureExtractor(FeatureExtractor): yield InstructionHandle(insn) def extract_insn_features(self, f, bb, insn): - for (feature, ea) in capa.features.extractors.ida.insn.extract_features(f, bb, insn): - yield feature, ea - yield from self.global_features + yield from capa.features.extractors.ida.insn.extract_features(f, bb, insn) diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index 3e5d97fd..a204107c 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -148,8 +148,28 @@ FILE_HANDLERS = ( extract_file_section_names, extract_file_strings, extract_file_function_names, - extract_file_os, extract_file_format, +) + + +def extract_global_features(pe, buf): + """ + extract global features from given workspace + + args: + pe (pefile.PE): the parsed PE + buf: the raw sample bytes + + yields: + Tuple[Feature, VA]: a feature and its location. 
+ """ + for handler in GLOBAL_HANDLERS: + for feature, va in handler(pe=pe, buf=buf): + yield feature, va + + +GLOBAL_HANDLERS = ( + extract_file_os, extract_file_arch, ) @@ -163,12 +183,17 @@ class PefileFeatureExtractor(FeatureExtractor): def get_base_address(self): return self.pe.OPTIONAL_HEADER.ImageBase + def extract_global_features(self): + with open(self.path, "rb") as f: + buf = f.read() + + yield from extract_global_features(self.pe, buf) + def extract_file_features(self): with open(self.path, "rb") as f: buf = f.read() - for feature, va in extract_file_features(self.pe, buf): - yield feature, va + yield from extract_file_features(self.pe, buf) def get_functions(self): raise NotImplementedError("PefileFeatureExtract can only be used to extract file features") diff --git a/capa/features/extractors/smda/extractor.py b/capa/features/extractors/smda/extractor.py index 1a13653b..e3519b6c 100644 --- a/capa/features/extractors/smda/extractor.py +++ b/capa/features/extractors/smda/extractor.py @@ -25,34 +25,29 @@ class SmdaFeatureExtractor(FeatureExtractor): def get_base_address(self): return self.smda_report.base_addr - def extract_file_features(self): - for feature, va in capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf): - yield feature, va + def extract_global_features(self): yield from self.global_features + def extract_file_features(self): + capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf) + def get_functions(self): for function in self.smda_report.getFunctions(): yield function def extract_function_features(self, f): - for feature, va in capa.features.extractors.smda.function.extract_features(f): - yield feature, va - yield from self.global_features + capa.features.extractors.smda.function.extract_features(f) def get_basic_blocks(self, f): for bb in f.getBlocks(): yield bb def extract_basic_block_features(self, f, bb): - for feature, va in 
capa.features.extractors.smda.basicblock.extract_features(f, bb): - yield feature, va - yield from self.global_features + capa.features.extractors.smda.basicblock.extract_features(f, bb) def get_instructions(self, f, bb): for smda_ins in bb.getInstructions(): yield smda_ins def extract_insn_features(self, f, bb, insn): - for feature, va in capa.features.extractors.smda.insn.extract_features(f, bb, insn): - yield feature, va - yield from self.global_features + capa.features.extractors.smda.insn.extract_features(f, bb, insn) diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index b18d4349..72fcee84 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -51,36 +51,31 @@ class VivisectFeatureExtractor(FeatureExtractor): # assume there is only one file loaded into the vw return list(self.vw.filemeta.values())[0]["imagebase"] - def extract_file_features(self): - for feature, va in capa.features.extractors.viv.file.extract_features(self.vw, self.buf): - yield feature, va + def extract_global_features(self): yield from self.global_features + def extract_file_features(self): + yield from capa.features.extractors.viv.file.extract_features(self.vw, self.buf) + def get_functions(self): for va in sorted(self.vw.getFunctions()): yield viv_utils.Function(self.vw, va) def extract_function_features(self, f): - for feature, va in capa.features.extractors.viv.function.extract_features(f): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.viv.function.extract_features(f) def get_basic_blocks(self, f): return f.basic_blocks def extract_basic_block_features(self, f, bb): - for feature, va in capa.features.extractors.viv.basicblock.extract_features(f, bb): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.viv.basicblock.extract_features(f, bb) def get_instructions(self, f, bb): for insn in bb.instructions: 
yield InstructionHandle(insn) def extract_insn_features(self, f, bb, insn): - for feature, va in capa.features.extractors.viv.insn.extract_features(f, bb, insn): - yield feature, va - yield from self.global_features + yield from capa.features.extractors.viv.insn.extract_features(f, bb, insn) def is_library_function(self, va): return viv_utils.flirt.is_library_function(self.vw, va) diff --git a/capa/features/freeze.py b/capa/features/freeze.py index 47fbcf3a..42e3d8c9 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -19,6 +19,10 @@ json format: ... }, 'scopes': { + 'global': [ + (str(name), [any(arg), ...], int(va), ()), + ... + ], 'file': [ (str(name), [any(arg), ...], int(va), ()), ... @@ -52,7 +56,6 @@ See the License for the specific language governing permissions and limitations import json import zlib import logging -import os.path import capa.features.file import capa.features.insn @@ -91,12 +94,15 @@ def dumps(extractor): "base address": extractor.get_base_address(), "functions": {}, "scopes": { + "global": [], "file": [], "function": [], "basic block": [], "instruction": [], }, } + for feature, va in extractor.extract_global_features(): + ret["scopes"]["global"].append(serialize_feature(feature) + (hex(va), ())) for feature, va in extractor.extract_file_features(): ret["scopes"]["file"].append(serialize_feature(feature) + (hex(va), ())) @@ -151,6 +157,7 @@ def loads(s): features = { "base address": doc.get("base address"), + "global features": [], "file features": [], "functions": {}, } @@ -180,6 +187,12 @@ def loads(s): # ('MatchedRule', ('foo', ), '0x401000', ('0x401000', )) # ^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^ ^^^^^^^^^^^^^^ # feature name args addr func/bb/insn + for feature in doc.get("scopes", {}).get("global", []): + va, loc = feature[2:] + va = int(va, 0x10) + feature = deserialize_feature(feature[:2]) + features["global features"].append((va, feature)) + for feature in doc.get("scopes", {}).get("file", []): va, loc = 
feature[2:] va = int(va, 0x10) diff --git a/capa/main.py b/capa/main.py index 2b91fe26..ef380581 100644 --- a/capa/main.py +++ b/capa/main.py @@ -79,7 +79,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: function_features = collections.defaultdict(set) # type: FeatureSet bb_matches = collections.defaultdict(list) # type: MatchResults - for feature, va in extractor.extract_function_features(f): + for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()): function_features[feature].add(va) for bb in extractor.get_basic_blocks(f): @@ -88,12 +88,12 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: # - basic blocks bb_features = collections.defaultdict(set) - for feature, va in extractor.extract_basic_block_features(f, bb): + for feature, va in itertools.chain(extractor.extract_basic_block_features(f, bb), extractor.extract_global_features()): bb_features[feature].add(va) function_features[feature].add(va) for insn in extractor.get_instructions(f, bb): - for feature, va in extractor.extract_insn_features(f, bb, insn): + for feature, va in itertools.chain(extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()): bb_features[feature].add(va) function_features[feature].add(va) @@ -112,7 +112,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet): file_features = collections.defaultdict(set) # type: FeatureSet - for feature, va in extractor.extract_file_features(): + for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()): # not all file features may have virtual addresses. # if not, then at least ensure the feature shows up in the index. # the set of addresses will still be empty. 
From b2590e7c9a83b4f483861dcfab312a456df26b00 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 4 Sep 2021 15:55:28 -0600 Subject: [PATCH 2/7] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1821d3f..45b27aa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ ### Breaking Changes - legacy term `arch` (i.e., "x32") is now called `bitness` @williballenthin +- freeze format gains new section for "global" features #759 @williballenthin + ### New Rules (78) From 2ea4dc9d7e18a0d9af120e368372b8d99fb91aa1 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 4 Sep 2021 15:58:32 -0600 Subject: [PATCH 3/7] tests: fixtures: extract global features at each scope --- tests/fixtures.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index cf1937da..8283502d 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -10,6 +10,7 @@ import os import os.path import binascii +import itertools import contextlib import collections from functools import lru_cache @@ -133,6 +134,13 @@ def get_pefile_extractor(path): return capa.features.extractors.pefile.PefileFeatureExtractor(path) +def extract_global_features(extractor): + features = collections.defaultdict(set) + for feature, va in extractor.extract_global_features(): + features[feature].add(va) + return features + + @lru_cache() def extract_file_features(extractor): features = collections.defaultdict(set) @@ -288,7 +296,7 @@ def resolve_scope(scope): if scope == "file": def inner_file(extractor): - return extract_file_features(extractor) + return itertools.chain(extract_file_features(extractor), extract_global_scope(extractor)) inner_file.__name__ = scope return inner_file @@ -301,7 +309,7 @@ def resolve_scope(scope): def inner_bb(extractor): f = get_function(extractor, fva) bb = get_basic_block(extractor, f, bbva) - return extract_basic_block_features(extractor, f, bb) 
+ return itertools.chain(extract_basic_block_features(extractor, f, bb), extract_global_scope(extractor)) inner_bb.__name__ = scope return inner_bb @@ -311,7 +319,7 @@ def resolve_scope(scope): def inner_function(extractor): f = get_function(extractor, va) - return extract_function_features(extractor, f) + return itertools.chain(extract_function_features(extractor, f), extract_global_scope(extractor)) inner_function.__name__ = scope return inner_function From 39e4e4776364cd7dbe92627d89f49fbd16587c60 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 4 Sep 2021 15:59:38 -0600 Subject: [PATCH 4/7] pep8 --- capa/features/extractors/base_extractor.py | 8 ++++---- capa/features/extractors/pefile.py | 2 +- capa/features/freeze.py | 2 +- capa/main.py | 8 ++++++-- tests/fixtures.py | 2 +- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 4b6be1ee..c287e75b 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -58,7 +58,7 @@ class FeatureExtractor: fetch the preferred load address at which the sample was analyzed. 
""" raise NotImplemented - + @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, int]]: """ @@ -74,7 +74,7 @@ class FeatureExtractor: Tuple[Feature, int]: feature and its location """ raise NotImplemented - + @abc.abstractmethod def extract_file_features(self) -> Iterator[Tuple[Feature, int]]: """ @@ -272,12 +272,12 @@ class NullFeatureExtractor(FeatureExtractor): def get_base_address(self): return self.features["base address"] - + def extract_global_features(self): for p in self.features.get("global features", []): va, feature = p yield feature, va - + def extract_file_features(self): for p in self.features.get("file features", []): va, feature = p diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index a204107c..8f6a3ed1 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -150,7 +150,7 @@ FILE_HANDLERS = ( extract_file_function_names, extract_file_format, ) - + def extract_global_features(pe, buf): """ diff --git a/capa/features/freeze.py b/capa/features/freeze.py index 42e3d8c9..9c0d0c98 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -192,7 +192,7 @@ def loads(s): va = int(va, 0x10) feature = deserialize_feature(feature[:2]) features["global features"].append((va, feature)) - + for feature in doc.get("scopes", {}).get("file", []): va, loc = feature[2:] va = int(va, 0x10) diff --git a/capa/main.py b/capa/main.py index ef380581..a7e1fb8a 100644 --- a/capa/main.py +++ b/capa/main.py @@ -88,12 +88,16 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: # - basic blocks bb_features = collections.defaultdict(set) - for feature, va in itertools.chain(extractor.extract_basic_block_features(f, bb), extractor.extract_global_features()): + for feature, va in itertools.chain( + extractor.extract_basic_block_features(f, bb), extractor.extract_global_features() + ): bb_features[feature].add(va) 
function_features[feature].add(va) for insn in extractor.get_instructions(f, bb): - for feature, va in itertools.chain(extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()): + for feature, va in itertools.chain( + extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() + ): bb_features[feature].add(va) function_features[feature].add(va) diff --git a/tests/fixtures.py b/tests/fixtures.py index 8283502d..1b404bb2 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -139,7 +139,7 @@ def extract_global_features(extractor): for feature, va in extractor.extract_global_features(): features[feature].add(va) return features - + @lru_cache() def extract_file_features(extractor): From 122fdc69e3702c3934d584d04768d69b58f02a9f Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 4 Sep 2021 16:00:49 -0600 Subject: [PATCH 5/7] fixtures: name error --- tests/fixtures.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 1b404bb2..e99c5cf9 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -296,7 +296,7 @@ def resolve_scope(scope): if scope == "file": def inner_file(extractor): - return itertools.chain(extract_file_features(extractor), extract_global_scope(extractor)) + return itertools.chain(extract_file_features(extractor), extract_global_features(extractor)) inner_file.__name__ = scope return inner_file @@ -309,7 +309,7 @@ def resolve_scope(scope): def inner_bb(extractor): f = get_function(extractor, fva) bb = get_basic_block(extractor, f, bbva) - return itertools.chain(extract_basic_block_features(extractor, f, bb), extract_global_scope(extractor)) + return itertools.chain(extract_basic_block_features(extractor, f, bb), extract_global_features(extractor)) inner_bb.__name__ = scope return inner_bb @@ -319,7 +319,7 @@ def resolve_scope(scope): def inner_function(extractor): f = get_function(extractor, va) - return 
itertools.chain(extract_function_features(extractor, f), extract_global_scope(extractor)) + return itertools.chain(extract_function_features(extractor, f), extract_global_features(extractor)) inner_function.__name__ = scope return inner_function From 32244b2641ef79189422b56c76757079c4180042 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 4 Sep 2021 16:12:51 -0600 Subject: [PATCH 6/7] fixtures: fix extraction of global features --- tests/fixtures.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index e99c5cf9..47c23371 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -296,7 +296,10 @@ def resolve_scope(scope): if scope == "file": def inner_file(extractor): - return itertools.chain(extract_file_features(extractor), extract_global_features(extractor)) + features = extract_file_features(extractor) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features inner_file.__name__ = scope return inner_file @@ -309,7 +312,10 @@ def resolve_scope(scope): def inner_bb(extractor): f = get_function(extractor, fva) bb = get_basic_block(extractor, f, bbva) - return itertools.chain(extract_basic_block_features(extractor, f, bb), extract_global_features(extractor)) + features = extract_basic_block_features(extractor, f, bb) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features inner_bb.__name__ = scope return inner_bb @@ -319,7 +325,10 @@ def resolve_scope(scope): def inner_function(extractor): f = get_function(extractor, va) - return itertools.chain(extract_function_features(extractor, f), extract_global_features(extractor)) + features = extract_function_features(extractor, f) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features inner_function.__name__ = scope return inner_function From 7a3287fa25eb0e9bd22b31cc6b828bf8fc367c46 Mon Sep 17 00:00:00 
2001 From: William Ballenthin Date: Sat, 4 Sep 2021 16:55:37 -0600 Subject: [PATCH 7/7] extractors: smda: fix missing `yield from` --- capa/features/extractors/smda/extractor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/features/extractors/smda/extractor.py b/capa/features/extractors/smda/extractor.py index e3519b6c..56788e43 100644 --- a/capa/features/extractors/smda/extractor.py +++ b/capa/features/extractors/smda/extractor.py @@ -29,25 +29,25 @@ class SmdaFeatureExtractor(FeatureExtractor): yield from self.global_features def extract_file_features(self): - capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf) + yield from capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf) def get_functions(self): for function in self.smda_report.getFunctions(): yield function def extract_function_features(self, f): - capa.features.extractors.smda.function.extract_features(f) + yield from capa.features.extractors.smda.function.extract_features(f) def get_basic_blocks(self, f): for bb in f.getBlocks(): yield bb def extract_basic_block_features(self, f, bb): - capa.features.extractors.smda.basicblock.extract_features(f, bb) + yield from capa.features.extractors.smda.basicblock.extract_features(f, bb) def get_instructions(self, f, bb): for smda_ins in bb.getInstructions(): yield smda_ins def extract_insn_features(self, f, bb, insn): - capa.features.extractors.smda.insn.extract_features(f, bb, insn) + yield from capa.features.extractors.smda.insn.extract_features(f, bb, insn)