mirror of
https://github.com/mandiant/capa.git
synced 2026-02-04 11:07:53 -08:00
Merge pull request #759 from fireeye/fix-755
extractors: extract global features as their own pseudo scope
This commit is contained in:
@@ -14,6 +14,8 @@
|
||||
### Breaking Changes
|
||||
|
||||
- legacy term `arch` (i.e., "x32") is now called `bitness` @williballenthin
|
||||
- freeze format gains new section for "global" features #759 @williballenthin
|
||||
|
||||
|
||||
### New Rules (78)
|
||||
|
||||
|
||||
@@ -59,6 +59,22 @@ class FeatureExtractor:
|
||||
"""
|
||||
raise NotImplemented
|
||||
|
||||
@abc.abstractmethod
def extract_global_features(self) -> Iterator[Tuple[Feature, int]]:
    """
    extract features found at every scope ("global").

    example::

        extractor = VivisectFeatureExtractor(vw, path)
        for feature, va in extractor.extract_global_features():
            print('0x%x: %s' % (va, feature))

    yields:
      Tuple[Feature, int]: feature and its location

    raises:
      NotImplementedError: always; subclasses must override this method.
    """
    # `NotImplemented` is the rich-comparison sentinel, not an exception class:
    # raising it fails with a TypeError instead of signalling a missing override.
    raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def extract_file_features(self) -> Iterator[Tuple[Feature, int]]:
|
||||
"""
|
||||
@@ -216,6 +232,10 @@ class NullFeatureExtractor(FeatureExtractor):
|
||||
|
||||
extractor = NullFeatureExtractor({
|
||||
'base address': 0x401000,
|
||||
'global features': [
|
||||
(0x0, capa.features.Arch('i386')),
|
||||
(0x0, capa.features.OS('linux')),
|
||||
],
|
||||
'file features': [
|
||||
(0x402345, capa.features.Characteristic('embedded pe')),
|
||||
],
|
||||
@@ -253,6 +273,11 @@ class NullFeatureExtractor(FeatureExtractor):
|
||||
def get_base_address(self):
    """return the value stored under the "base address" key of `self.features`."""
    base_address = self.features["base address"]
    return base_address
|
||||
|
||||
def extract_global_features(self):
    """
    yield the entries stored under the "global features" key of `self.features`.

    entries are stored as (va, feature) pairs and are yielded as (feature, va);
    nothing is yielded when the key is absent.
    """
    for va, feature in self.features.get("global features", []):
        yield feature, va
|
||||
|
||||
def extract_file_features(self):
|
||||
for p in self.features.get("file features", []):
|
||||
va, feature = p
|
||||
|
||||
@@ -66,11 +66,12 @@ class IdaFeatureExtractor(FeatureExtractor):
|
||||
def get_base_address(self):
    """return the value reported by `idaapi.get_imagebase()`."""
    image_base = idaapi.get_imagebase()
    return image_base
|
||||
|
||||
def extract_file_features(self):
|
||||
for (feature, ea) in capa.features.extractors.ida.file.extract_features():
|
||||
yield feature, ea
|
||||
def extract_global_features(self):
|
||||
yield from self.global_features
|
||||
|
||||
def extract_file_features(self):
|
||||
yield from capa.features.extractors.ida.file.extract_features()
|
||||
|
||||
def get_functions(self):
|
||||
import capa.features.extractors.ida.helpers as ida_helpers
|
||||
|
||||
@@ -90,9 +91,7 @@ class IdaFeatureExtractor(FeatureExtractor):
|
||||
return FunctionHandle(f)
|
||||
|
||||
def extract_function_features(self, f):
|
||||
for (feature, ea) in capa.features.extractors.ida.function.extract_features(f):
|
||||
yield feature, ea
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.ida.function.extract_features(f)
|
||||
|
||||
def get_basic_blocks(self, f):
|
||||
import capa.features.extractors.ida.helpers as ida_helpers
|
||||
@@ -101,9 +100,7 @@ class IdaFeatureExtractor(FeatureExtractor):
|
||||
yield BasicBlockHandle(bb)
|
||||
|
||||
def extract_basic_block_features(self, f, bb):
|
||||
for (feature, ea) in capa.features.extractors.ida.basicblock.extract_features(f, bb):
|
||||
yield feature, ea
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.ida.basicblock.extract_features(f, bb)
|
||||
|
||||
def get_instructions(self, f, bb):
|
||||
import capa.features.extractors.ida.helpers as ida_helpers
|
||||
@@ -112,6 +109,4 @@ class IdaFeatureExtractor(FeatureExtractor):
|
||||
yield InstructionHandle(insn)
|
||||
|
||||
def extract_insn_features(self, f, bb, insn):
|
||||
for (feature, ea) in capa.features.extractors.ida.insn.extract_features(f, bb, insn):
|
||||
yield feature, ea
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.ida.insn.extract_features(f, bb, insn)
|
||||
|
||||
@@ -148,8 +148,28 @@ FILE_HANDLERS = (
|
||||
extract_file_section_names,
|
||||
extract_file_strings,
|
||||
extract_file_function_names,
|
||||
extract_file_os,
|
||||
extract_file_format,
|
||||
)
|
||||
|
||||
|
||||
def extract_global_features(pe, buf):
    """
    run every handler in GLOBAL_HANDLERS and yield the pairs they produce.

    args:
      pe (pefile.PE): the parsed PE
      buf: the raw sample bytes

    yields:
      Tuple[Feature, VA]: a feature and its location.
    """
    for extract in GLOBAL_HANDLERS:
        for item in extract(pe=pe, buf=buf):
            # unpack explicitly so that anything other than a pair is rejected here
            feature, va = item
            yield feature, va
|
||||
|
||||
|
||||
# handlers iterated by extract_global_features; each is invoked as handler(pe=pe, buf=buf)
GLOBAL_HANDLERS = (extract_file_os, extract_file_arch)
|
||||
|
||||
@@ -163,12 +183,17 @@ class PefileFeatureExtractor(FeatureExtractor):
|
||||
def get_base_address(self):
    """return the `ImageBase` field of the parsed PE's optional header."""
    optional_header = self.pe.OPTIONAL_HEADER
    return optional_header.ImageBase
|
||||
|
||||
def extract_global_features(self):
    """
    yield the global features of the sample at `self.path`.

    the file is read from disk in binary mode on each call, and its bytes are
    handed to `extract_global_features` together with the parsed PE.
    """
    with open(self.path, "rb") as sample:
        raw_bytes = sample.read()

    yield from extract_global_features(self.pe, raw_bytes)
|
||||
|
||||
def extract_file_features(self):
|
||||
with open(self.path, "rb") as f:
|
||||
buf = f.read()
|
||||
|
||||
for feature, va in extract_file_features(self.pe, buf):
|
||||
yield feature, va
|
||||
yield from extract_file_features(self.pe, buf)
|
||||
|
||||
def get_functions(self):
|
||||
raise NotImplementedError("PefileFeatureExtract can only be used to extract file features")
|
||||
|
||||
@@ -25,34 +25,29 @@ class SmdaFeatureExtractor(FeatureExtractor):
|
||||
def get_base_address(self):
    """return the `base_addr` attribute of the SMDA report."""
    report = self.smda_report
    return report.base_addr
|
||||
|
||||
def extract_file_features(self):
|
||||
for feature, va in capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf):
|
||||
yield feature, va
|
||||
def extract_global_features(self):
|
||||
yield from self.global_features
|
||||
|
||||
def extract_file_features(self):
|
||||
yield from capa.features.extractors.smda.file.extract_features(self.smda_report, self.buf)
|
||||
|
||||
def get_functions(self):
    """yield each item produced by the SMDA report's `getFunctions()`."""
    yield from self.smda_report.getFunctions()
|
||||
|
||||
def extract_function_features(self, f):
|
||||
for feature, va in capa.features.extractors.smda.function.extract_features(f):
|
||||
yield feature, va
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.smda.function.extract_features(f)
|
||||
|
||||
def get_basic_blocks(self, f):
    """yield each item produced by `f.getBlocks()`."""
    yield from f.getBlocks()
|
||||
|
||||
def extract_basic_block_features(self, f, bb):
|
||||
for feature, va in capa.features.extractors.smda.basicblock.extract_features(f, bb):
|
||||
yield feature, va
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.smda.basicblock.extract_features(f, bb)
|
||||
|
||||
def get_instructions(self, f, bb):
    """yield each item produced by `bb.getInstructions()`; `f` is not used."""
    yield from bb.getInstructions()
|
||||
|
||||
def extract_insn_features(self, f, bb, insn):
|
||||
for feature, va in capa.features.extractors.smda.insn.extract_features(f, bb, insn):
|
||||
yield feature, va
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.smda.insn.extract_features(f, bb, insn)
|
||||
|
||||
@@ -51,36 +51,31 @@ class VivisectFeatureExtractor(FeatureExtractor):
|
||||
# assume there is only one file loaded into the vw
|
||||
return list(self.vw.filemeta.values())[0]["imagebase"]
|
||||
|
||||
def extract_file_features(self):
|
||||
for feature, va in capa.features.extractors.viv.file.extract_features(self.vw, self.buf):
|
||||
yield feature, va
|
||||
def extract_global_features(self):
|
||||
yield from self.global_features
|
||||
|
||||
def extract_file_features(self):
|
||||
yield from capa.features.extractors.viv.file.extract_features(self.vw, self.buf)
|
||||
|
||||
def get_functions(self):
    """yield a `viv_utils.Function` for each address from `self.vw.getFunctions()`, in sorted order."""
    function_vas = sorted(self.vw.getFunctions())
    for function_va in function_vas:
        yield viv_utils.Function(self.vw, function_va)
|
||||
|
||||
def extract_function_features(self, f):
|
||||
for feature, va in capa.features.extractors.viv.function.extract_features(f):
|
||||
yield feature, va
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.viv.function.extract_features(f)
|
||||
|
||||
def get_basic_blocks(self, f):
    """return the `basic_blocks` attribute of the given function object."""
    blocks = f.basic_blocks
    return blocks
|
||||
|
||||
def extract_basic_block_features(self, f, bb):
|
||||
for feature, va in capa.features.extractors.viv.basicblock.extract_features(f, bb):
|
||||
yield feature, va
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.viv.basicblock.extract_features(f, bb)
|
||||
|
||||
def get_instructions(self, f, bb):
    """yield an `InstructionHandle` wrapping each entry of `bb.instructions`; `f` is not used."""
    yield from (InstructionHandle(instruction) for instruction in bb.instructions)
|
||||
|
||||
def extract_insn_features(self, f, bb, insn):
|
||||
for feature, va in capa.features.extractors.viv.insn.extract_features(f, bb, insn):
|
||||
yield feature, va
|
||||
yield from self.global_features
|
||||
yield from capa.features.extractors.viv.insn.extract_features(f, bb, insn)
|
||||
|
||||
def is_library_function(self, va):
|
||||
return viv_utils.flirt.is_library_function(self.vw, va)
|
||||
|
||||
@@ -19,6 +19,10 @@ json format:
|
||||
...
|
||||
},
|
||||
'scopes': {
|
||||
'global': [
|
||||
(str(name), [any(arg), ...], int(va), ()),
|
||||
...
|
||||
],
|
||||
'file': [
|
||||
(str(name), [any(arg), ...], int(va), ()),
|
||||
...
|
||||
@@ -52,7 +56,6 @@ See the License for the specific language governing permissions and limitations
|
||||
import json
|
||||
import zlib
|
||||
import logging
|
||||
import os.path
|
||||
|
||||
import capa.features.file
|
||||
import capa.features.insn
|
||||
@@ -91,12 +94,15 @@ def dumps(extractor):
|
||||
"base address": extractor.get_base_address(),
|
||||
"functions": {},
|
||||
"scopes": {
|
||||
"global": [],
|
||||
"file": [],
|
||||
"function": [],
|
||||
"basic block": [],
|
||||
"instruction": [],
|
||||
},
|
||||
}
|
||||
for feature, va in extractor.extract_global_features():
|
||||
ret["scopes"]["global"].append(serialize_feature(feature) + (hex(va), ()))
|
||||
|
||||
for feature, va in extractor.extract_file_features():
|
||||
ret["scopes"]["file"].append(serialize_feature(feature) + (hex(va), ()))
|
||||
@@ -151,6 +157,7 @@ def loads(s):
|
||||
|
||||
features = {
|
||||
"base address": doc.get("base address"),
|
||||
"global features": [],
|
||||
"file features": [],
|
||||
"functions": {},
|
||||
}
|
||||
@@ -180,6 +187,12 @@ def loads(s):
|
||||
# ('MatchedRule', ('foo', ), '0x401000', ('0x401000', ))
|
||||
# ^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^ ^^^^^^^^^^^^^^
|
||||
# feature name args addr func/bb/insn
|
||||
for feature in doc.get("scopes", {}).get("global", []):
|
||||
va, loc = feature[2:]
|
||||
va = int(va, 0x10)
|
||||
feature = deserialize_feature(feature[:2])
|
||||
features["global features"].append((va, feature))
|
||||
|
||||
for feature in doc.get("scopes", {}).get("file", []):
|
||||
va, loc = feature[2:]
|
||||
va = int(va, 0x10)
|
||||
|
||||
12
capa/main.py
12
capa/main.py
@@ -79,7 +79,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f:
|
||||
function_features = collections.defaultdict(set) # type: FeatureSet
|
||||
bb_matches = collections.defaultdict(list) # type: MatchResults
|
||||
|
||||
for feature, va in extractor.extract_function_features(f):
|
||||
for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()):
|
||||
function_features[feature].add(va)
|
||||
|
||||
for bb in extractor.get_basic_blocks(f):
|
||||
@@ -88,12 +88,16 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f:
|
||||
# - basic blocks
|
||||
bb_features = collections.defaultdict(set)
|
||||
|
||||
for feature, va in extractor.extract_basic_block_features(f, bb):
|
||||
for feature, va in itertools.chain(
|
||||
extractor.extract_basic_block_features(f, bb), extractor.extract_global_features()
|
||||
):
|
||||
bb_features[feature].add(va)
|
||||
function_features[feature].add(va)
|
||||
|
||||
for insn in extractor.get_instructions(f, bb):
|
||||
for feature, va in extractor.extract_insn_features(f, bb, insn):
|
||||
for feature, va in itertools.chain(
|
||||
extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()
|
||||
):
|
||||
bb_features[feature].add(va)
|
||||
function_features[feature].add(va)
|
||||
|
||||
@@ -112,7 +116,7 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f:
|
||||
def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet):
|
||||
file_features = collections.defaultdict(set) # type: FeatureSet
|
||||
|
||||
for feature, va in extractor.extract_file_features():
|
||||
for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()):
|
||||
# not all file features may have virtual addresses.
|
||||
# if not, then at least ensure the feature shows up in the index.
|
||||
# the set of addresses will still be empty.
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
import os
|
||||
import os.path
|
||||
import binascii
|
||||
import itertools
|
||||
import contextlib
|
||||
import collections
|
||||
from functools import lru_cache
|
||||
@@ -133,6 +134,13 @@ def get_pefile_extractor(path):
|
||||
return capa.features.extractors.pefile.PefileFeatureExtractor(path)
|
||||
|
||||
|
||||
def extract_global_features(extractor):
    """
    collect the extractor's global features into a feature -> set-of-addresses mapping.

    returns:
      collections.defaultdict(set): every address at which each feature was reported.
    """
    locations_by_feature = collections.defaultdict(set)
    for pair in extractor.extract_global_features():
        feature, va = pair
        locations_by_feature[feature].add(va)
    return locations_by_feature
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def extract_file_features(extractor):
|
||||
features = collections.defaultdict(set)
|
||||
@@ -288,7 +296,10 @@ def resolve_scope(scope):
|
||||
if scope == "file":
|
||||
|
||||
def inner_file(extractor):
|
||||
return extract_file_features(extractor)
|
||||
features = extract_file_features(extractor)
|
||||
for k, vs in extract_global_features(extractor).items():
|
||||
features[k].update(vs)
|
||||
return features
|
||||
|
||||
inner_file.__name__ = scope
|
||||
return inner_file
|
||||
@@ -301,7 +312,10 @@ def resolve_scope(scope):
|
||||
def inner_bb(extractor):
|
||||
f = get_function(extractor, fva)
|
||||
bb = get_basic_block(extractor, f, bbva)
|
||||
return extract_basic_block_features(extractor, f, bb)
|
||||
features = extract_basic_block_features(extractor, f, bb)
|
||||
for k, vs in extract_global_features(extractor).items():
|
||||
features[k].update(vs)
|
||||
return features
|
||||
|
||||
inner_bb.__name__ = scope
|
||||
return inner_bb
|
||||
@@ -311,7 +325,10 @@ def resolve_scope(scope):
|
||||
|
||||
def inner_function(extractor):
|
||||
f = get_function(extractor, va)
|
||||
return extract_function_features(extractor, f)
|
||||
features = extract_function_features(extractor, f)
|
||||
for k, vs in extract_global_features(extractor).items():
|
||||
features[k].update(vs)
|
||||
return features
|
||||
|
||||
inner_function.__name__ = scope
|
||||
return inner_function
|
||||
|
||||
Reference in New Issue
Block a user