Merge pull request #759 from fireeye/fix-755

extractors: extract global features as their own pseudo scope
This commit is contained in:
Willi Ballenthin
2021-09-09 20:16:48 -06:00
committed by GitHub
9 changed files with 118 additions and 47 deletions

View File

@@ -14,6 +14,8 @@
### Breaking Changes
- legacy term `arch` (i.e., "x32") is now called `bitness` @williballenthin
- freeze format gains new section for "global" features #759 @williballenthin
### New Rules (78)

View File

@@ -59,6 +59,22 @@ class FeatureExtractor:
"""
raise NotImplemented
@abc.abstractmethod
def extract_global_features(self) -> Iterator[Tuple[Feature, int]]:
    """
    extract features found at every scope ("global").

    example::

        extractor = VivisectFeatureExtractor(vw, path)
        for feature, va in extractor.extract_global_features():
            print('0x%x: %s' % (va, feature))

    yields:
      Tuple[Feature, int]: feature and its location

    raises:
      NotImplementedError: always; subclasses provide the implementation.
    """
    # `NotImplemented` is the binary-operator sentinel, not an exception class:
    # raising it fails with a TypeError instead of signalling "not implemented".
    raise NotImplementedError()
@abc.abstractmethod
def extract_file_features(self) -> Iterator[Tuple[Feature, int]]:
"""
@@ -216,6 +232,10 @@ class NullFeatureExtractor(FeatureExtractor):
extractor = NullFeatureExtractor({
'base address': 0x401000,
'global features': [
(0x0, capa.features.Arch('i386')),
(0x0, capa.features.OS('linux')),
],
'file features': [
(0x402345, capa.features.Characteristic('embedded pe')),
],
@@ -253,6 +273,11 @@ class NullFeatureExtractor(FeatureExtractor):
def get_base_address(self):
    """return the value stored under the "base address" key of `self.features`."""
    base_address = self.features["base address"]
    return base_address
def extract_global_features(self):
    """
    yield `(feature, va)` for each `(va, feature)` entry stored under
    the "global features" key of `self.features`; yields nothing if the key is absent.
    """
    for va, feature in self.features.get("global features", []):
        yield feature, va
def extract_file_features(self):
for p in self.features.get("file features", []):
va, feature = p

View File

@@ -66,11 +66,12 @@ class IdaFeatureExtractor(FeatureExtractor):
def get_base_address(self):
    """return whatever `idaapi.get_imagebase()` reports."""
    image_base = idaapi.get_imagebase()
    return image_base
def extract_file_features(self):
for (feature, ea) in capa.features.extractors.ida.file.extract_features():
yield feature, ea
def extract_global_features(self):
    """yield each item of `self.global_features`, in order."""
    for item in self.global_features:
        yield item
def extract_file_features(self):
    """yield everything produced by `capa.features.extractors.ida.file.extract_features()`."""
    for item in capa.features.extractors.ida.file.extract_features():
        yield item
def get_functions(self):
import capa.features.extractors.ida.helpers as ida_helpers
@@ -90,9 +91,7 @@ class IdaFeatureExtractor(FeatureExtractor):
return FunctionHandle(f)
def extract_function_features(self, f):
for (feature, ea) in capa.features.extractors.ida.function.extract_features(f):
yield feature, ea
yield from self.global_features
yield from capa.features.extractors.ida.function.extract_features(f)
def get_basic_blocks(self, f):
import capa.features.extractors.ida.helpers as ida_helpers
@@ -101,9 +100,7 @@ class IdaFeatureExtractor(FeatureExtractor):
yield BasicBlockHandle(bb)
def extract_basic_block_features(self, f, bb):
for (feature, ea) in capa.features.extractors.ida.basicblock.extract_features(f, bb):
yield feature, ea
yield from self.global_features
yield from capa.features.extractors.ida.basicblock.extract_features(f, bb)
def get_instructions(self, f, bb):
import capa.features.extractors.ida.helpers as ida_helpers
@@ -112,6 +109,4 @@ class IdaFeatureExtractor(FeatureExtractor):
yield InstructionHandle(insn)
def extract_insn_features(self, f, bb, insn):
for (feature, ea) in capa.features.extractors.ida.insn.extract_features(f, bb, insn):
yield feature, ea
yield from self.global_features
yield from capa.features.extractors.ida.insn.extract_features(f, bb, insn)

View File

@@ -148,8 +148,28 @@ FILE_HANDLERS = (
extract_file_section_names,
extract_file_strings,
extract_file_function_names,
extract_file_os,
extract_file_format,
)
def extract_global_features(pe, buf):
    """
    run each handler in GLOBAL_HANDLERS against the sample and stream the results.

    args:
      pe (pefile.PE): the parsed PE
      buf: the raw sample bytes

    yields:
      Tuple[Feature, VA]: a feature and its location.
    """
    # each handler is invoked with keyword arguments, in GLOBAL_HANDLERS order.
    yield from (
        (feature, va)
        for extract in GLOBAL_HANDLERS
        for feature, va in extract(pe=pe, buf=buf)
    )
# handlers that `extract_global_features(pe, buf)` invokes, in this order,
# each as `handler(pe=pe, buf=buf)`; each is iterated for (feature, va) pairs.
GLOBAL_HANDLERS = (
extract_file_os,
extract_file_arch,
)
@@ -163,12 +183,17 @@ class PefileFeatureExtractor(FeatureExtractor):
def get_base_address(self):
    """return `ImageBase` from the optional header of `self.pe`."""
    optional_header = self.pe.OPTIONAL_HEADER
    return optional_header.ImageBase
# global-scope features for the PE: reads the file at `self.path` in binary mode
# and yields whatever the module-level `extract_global_features(self.pe, buf)` yields.
def extract_global_features(self):
with open(self.path, "rb") as f:
buf = f.read()
yield from extract_global_features(self.pe, buf)
def extract_file_features(self):
with open(self.path, "rb") as f:
buf = f.read()
for feature, va in extract_file_features(self.pe, buf):
yield feature, va
yield from extract_file_features(self.pe, buf)
def get_functions(self):
    """
    unsupported by this extractor.

    raises:
      NotImplementedError: always.
    """
    # message previously misspelled the class name as "PefileFeatureExtract".
    raise NotImplementedError("PefileFeatureExtractor can only be used to extract file features")

View File

@@ -25,34 +25,29 @@ class SmdaFeatureExtractor(FeatureExtractor):
def get_base_address(self):
    """return the `base_addr` attribute of `self.smda_report`."""
    report = self.smda_report
    return report.base_addr
def extract_file_features(self):
for feature, va in capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf):
yield feature, va
def extract_global_features(self):
    """yield each item of `self.global_features`, in order."""
    for item in self.global_features:
        yield item
def extract_file_features(self):
    """
    yield everything produced by the SMDA file-scope `extract_features`,
    called with `self.smda_report` and `self.buf`.
    """
    for item in capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf):
        yield item
def get_functions(self):
    """yield each function returned by `self.smda_report.getFunctions()`."""
    yield from self.smda_report.getFunctions()
def extract_function_features(self, f):
for feature, va in capa.features.extractors.smda.function.extract_features(f):
yield feature, va
yield from self.global_features
yield from capa.features.extractors.smda.function.extract_features(f)
def get_basic_blocks(self, f):
    """yield each basic block returned by `f.getBlocks()`."""
    yield from f.getBlocks()
def extract_basic_block_features(self, f, bb):
for feature, va in capa.features.extractors.smda.basicblock.extract_features(f, bb):
yield feature, va
yield from self.global_features
yield from capa.features.extractors.smda.basicblock.extract_features(f, bb)
def get_instructions(self, f, bb):
    """yield each instruction returned by `bb.getInstructions()`; `f` is not used."""
    yield from bb.getInstructions()
def extract_insn_features(self, f, bb, insn):
for feature, va in capa.features.extractors.smda.insn.extract_features(f, bb, insn):
yield feature, va
yield from self.global_features
yield from capa.features.extractors.smda.insn.extract_features(f, bb, insn)

View File

@@ -51,36 +51,31 @@ class VivisectFeatureExtractor(FeatureExtractor):
# assume there is only one file loaded into the vw
return list(self.vw.filemeta.values())[0]["imagebase"]
def extract_file_features(self):
for feature, va in capa.features.extractors.viv.file.extract_features(self.vw, self.buf):
yield feature, va
def extract_global_features(self):
    """yield each item of `self.global_features`, in order."""
    for item in self.global_features:
        yield item
def extract_file_features(self):
    """
    yield everything produced by the vivisect file-scope `extract_features`,
    called with `self.vw` and `self.buf`.
    """
    for item in capa.features.extractors.viv.file.extract_features(self.vw, self.buf):
        yield item
def get_functions(self):
    """
    yield a `viv_utils.Function` for each address returned by
    `self.vw.getFunctions()`, visiting the addresses in sorted order.
    """
    function_vas = sorted(self.vw.getFunctions())
    yield from (viv_utils.Function(self.vw, va) for va in function_vas)
def extract_function_features(self, f):
for feature, va in capa.features.extractors.viv.function.extract_features(f):
yield feature, va
yield from self.global_features
yield from capa.features.extractors.viv.function.extract_features(f)
def get_basic_blocks(self, f):
    """return the `basic_blocks` attribute of `f`."""
    blocks = f.basic_blocks
    return blocks
def extract_basic_block_features(self, f, bb):
for feature, va in capa.features.extractors.viv.basicblock.extract_features(f, bb):
yield feature, va
yield from self.global_features
yield from capa.features.extractors.viv.basicblock.extract_features(f, bb)
def get_instructions(self, f, bb):
    """yield an `InstructionHandle` wrapping each entry of `bb.instructions`; `f` is not used."""
    yield from (InstructionHandle(raw_insn) for raw_insn in bb.instructions)
def extract_insn_features(self, f, bb, insn):
for feature, va in capa.features.extractors.viv.insn.extract_features(f, bb, insn):
yield feature, va
yield from self.global_features
yield from capa.features.extractors.viv.insn.extract_features(f, bb, insn)
def is_library_function(self, va):
    """return the result of `viv_utils.flirt.is_library_function` for `va` in `self.vw`."""
    verdict = viv_utils.flirt.is_library_function(self.vw, va)
    return verdict

View File

@@ -19,6 +19,10 @@ json format:
...
},
'scopes': {
'global': [
(str(name), [any(arg), ...], int(va), ()),
...
],
'file': [
(str(name), [any(arg), ...], int(va), ()),
...
@@ -52,7 +56,6 @@ See the License for the specific language governing permissions and limitations
import json
import zlib
import logging
import os.path
import capa.features.file
import capa.features.insn
@@ -91,12 +94,15 @@ def dumps(extractor):
"base address": extractor.get_base_address(),
"functions": {},
"scopes": {
"global": [],
"file": [],
"function": [],
"basic block": [],
"instruction": [],
},
}
for feature, va in extractor.extract_global_features():
ret["scopes"]["global"].append(serialize_feature(feature) + (hex(va), ()))
for feature, va in extractor.extract_file_features():
ret["scopes"]["file"].append(serialize_feature(feature) + (hex(va), ()))
@@ -151,6 +157,7 @@ def loads(s):
features = {
"base address": doc.get("base address"),
"global features": [],
"file features": [],
"functions": {},
}
@@ -180,6 +187,12 @@ def loads(s):
# ('MatchedRule', ('foo', ), '0x401000', ('0x401000', ))
# ^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^ ^^^^^^^^^^^^^^
# feature name args addr func/bb/insn
for feature in doc.get("scopes", {}).get("global", []):
va, loc = feature[2:]
va = int(va, 0x10)
feature = deserialize_feature(feature[:2])
features["global features"].append((va, feature))
for feature in doc.get("scopes", {}).get("file", []):
va, loc = feature[2:]
va = int(va, 0x10)

View File

@@ -79,7 +79,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f:
function_features = collections.defaultdict(set) # type: FeatureSet
bb_matches = collections.defaultdict(list) # type: MatchResults
for feature, va in extractor.extract_function_features(f):
for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()):
function_features[feature].add(va)
for bb in extractor.get_basic_blocks(f):
@@ -88,12 +88,16 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f:
# - basic blocks
bb_features = collections.defaultdict(set)
for feature, va in extractor.extract_basic_block_features(f, bb):
for feature, va in itertools.chain(
extractor.extract_basic_block_features(f, bb), extractor.extract_global_features()
):
bb_features[feature].add(va)
function_features[feature].add(va)
for insn in extractor.get_instructions(f, bb):
for feature, va in extractor.extract_insn_features(f, bb, insn):
for feature, va in itertools.chain(
extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()
):
bb_features[feature].add(va)
function_features[feature].add(va)
@@ -112,7 +116,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f:
def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet):
file_features = collections.defaultdict(set) # type: FeatureSet
for feature, va in extractor.extract_file_features():
for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()):
# not all file features may have virtual addresses.
# if not, then at least ensure the feature shows up in the index.
# the set of addresses will still be empty.

View File

@@ -10,6 +10,7 @@
import os
import os.path
import binascii
import itertools
import contextlib
import collections
from functools import lru_cache
@@ -133,6 +134,13 @@ def get_pefile_extractor(path):
return capa.features.extractors.pefile.PefileFeatureExtractor(path)
def extract_global_features(extractor):
    """
    index the extractor's global features by feature.

    returns a defaultdict mapping each feature yielded by
    `extractor.extract_global_features()` to the set of addresses it was yielded with.
    """
    locations_by_feature = collections.defaultdict(set)
    for pair in extractor.extract_global_features():
        feature, va = pair
        locations_by_feature[feature].add(va)
    return locations_by_feature
@lru_cache()
def extract_file_features(extractor):
features = collections.defaultdict(set)
@@ -288,7 +296,10 @@ def resolve_scope(scope):
if scope == "file":
def inner_file(extractor):
return extract_file_features(extractor)
features = extract_file_features(extractor)
for k, vs in extract_global_features(extractor).items():
features[k].update(vs)
return features
inner_file.__name__ = scope
return inner_file
@@ -301,7 +312,10 @@ def resolve_scope(scope):
def inner_bb(extractor):
f = get_function(extractor, fva)
bb = get_basic_block(extractor, f, bbva)
return extract_basic_block_features(extractor, f, bb)
features = extract_basic_block_features(extractor, f, bb)
for k, vs in extract_global_features(extractor).items():
features[k].update(vs)
return features
inner_bb.__name__ = scope
return inner_bb
@@ -311,7 +325,10 @@ def resolve_scope(scope):
def inner_function(extractor):
f = get_function(extractor, va)
return extract_function_features(extractor, f)
features = extract_function_features(extractor, f)
for k, vs in extract_global_features(extractor).items():
features[k].update(vs)
return features
inner_function.__name__ = scope
return inner_function