base_extractor(): add a StaticFeatureExtractor and DynamicFeatureExtractor base classes, as well as a FeatureExtractor type alias

This commit is contained in:
Yacine Elhamer
2023-06-25 22:57:39 +01:00
parent 708cb28ed0
commit 37ed138dcf
14 changed files with 52 additions and 36 deletions

View File

@@ -63,16 +63,18 @@ class InsnHandle:
inner: Any
class FeatureExtractor:
class StaticFeatureExtractor:
"""
FeatureExtractor defines the interface for fetching features from a sample.
StaticFeatureExtractor defines the interface for fetching features from a
sample without running it; extractors that rely on the execution trace of
a sample must implement the other sibling class, DynamicFeatureExtracor.
There may be multiple backends that support fetching features for capa.
For example, we use vivisect by default, but also want to support saving
and restoring features from a JSON file.
When we restore the features, we'd like to use exactly the same matching logic
to find matching rules.
Therefore, we can define a FeatureExtractor that provides features from the
Therefore, we can define a StaticFeatureExtractor that provides features from the
serialized JSON file and do matching without a binary analysis pass.
Also, this provides a way to hook in an IDA backend.
@@ -292,9 +294,11 @@ class ThreadHandle:
inner: Any
class DynamicExtractor(FeatureExtractor):
class DynamicFeatureExtractor:
"""
DynamicExtractor defines the interface for fetching features from a sandbox' analysis of a sample.
DynamicFeatureExtractor defines the interface for fetching features from a
sandbox' analysis of a sample; extractors that rely on statically analyzing
a sample must implement the sibling extractor, StaticFeatureExtractor.
Features are grouped mainly into threads that alongside their meta-features are also grouped into
processes (that also have their own features). Other scopes (such as function and file) may also apply
@@ -336,3 +340,6 @@ class DynamicExtractor(FeatureExtractor):
- network activity
"""
raise NotImplementedError()
FeatureExtractor = StaticFeatureExtractor | DynamicFeatureExtractor

View File

@@ -17,10 +17,10 @@ import capa.features.extractors.binja.function
import capa.features.extractors.binja.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
class BinjaFeatureExtractor(FeatureExtractor):
class BinjaFeatureExtractor(StaticFeatureExtractor):
def __init__(self, bv: binja.BinaryView):
super().__init__()
self.bv = bv

View File

@@ -14,12 +14,12 @@ import capa.features.extractors.cape.global_
import capa.features.extractors.cape.process
from capa.features.common import Feature
from capa.features.address import Address
from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicExtractor
from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicFeatureExtractor
logger = logging.getLogger(__name__)
class CapeExtractor(DynamicExtractor):
class CapeExtractor(DynamicFeatureExtractor):
def __init__(self, static: Dict, behavior: Dict):
super().__init__()
self.static = static

View File

@@ -14,7 +14,7 @@ import capa.features.extractors.cape.global_
import capa.features.extractors.cape.process
from capa.features.common import String, Feature
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicExtractor
from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle
logger = logging.getLogger(__name__)

View File

@@ -21,7 +21,7 @@ import capa.features.extractors.dnfile.function
from capa.features.common import Feature
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress
from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
from capa.features.extractors.dnfile.helpers import (
get_dotnet_types,
get_dotnet_fields,
@@ -67,7 +67,7 @@ class DnFileFeatureExtractorCache:
return self.types.get(token, None)
class DnfileFeatureExtractor(FeatureExtractor):
class DnfileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: str):
super().__init__()
self.pe: dnfile.dnPE = dnfile.dnPE(path)

View File

@@ -17,7 +17,7 @@ from capa.features.common import (
Feature,
)
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import FeatureExtractor
from capa.features.extractors.base_extractor import StaticFeatureExtractor
logger = logging.getLogger(__name__)
@@ -73,7 +73,7 @@ GLOBAL_HANDLERS = (
)
class DnfileFeatureExtractor(FeatureExtractor):
class DnfileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: str):
super().__init__()
self.path: str = path

View File

@@ -23,7 +23,7 @@ from capa.features.common import (
Characteristic,
)
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress
from capa.features.extractors.base_extractor import FeatureExtractor
from capa.features.extractors.base_extractor import StaticFeatureExtractor
from capa.features.extractors.dnfile.helpers import (
DnType,
iter_dotnet_table,
@@ -157,7 +157,7 @@ GLOBAL_HANDLERS = (
)
class DotnetFileFeatureExtractor(FeatureExtractor):
class DotnetFileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: str):
super().__init__()
self.path: str = path

View File

@@ -15,7 +15,7 @@ import capa.features.extractors.common
from capa.features.file import Import, Section
from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature
from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import FeatureExtractor
from capa.features.extractors.base_extractor import StaticFeatureExtractor
logger = logging.getLogger(__name__)
@@ -106,7 +106,7 @@ GLOBAL_HANDLERS = (
)
class ElfFeatureExtractor(FeatureExtractor):
class ElfFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: str):
super().__init__()
self.path = path

View File

@@ -18,10 +18,10 @@ import capa.features.extractors.ida.function
import capa.features.extractors.ida.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
class IdaFeatureExtractor(FeatureExtractor):
class IdaFeatureExtractor(StaticFeatureExtractor):
def __init__(self):
super().__init__()
self.global_features: List[Tuple[Feature, Address]] = []

View File

@@ -3,7 +3,7 @@ from dataclasses import dataclass
from capa.features.common import Feature
from capa.features.address import NO_ADDRESS, Address
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
@dataclass
@@ -24,7 +24,7 @@ class FunctionFeatures:
@dataclass
class NullFeatureExtractor(FeatureExtractor):
class NullFeatureExtractor(StaticFeatureExtractor):
"""
An extractor that extracts some user-provided features.

View File

@@ -18,7 +18,7 @@ import capa.features.extractors.strings
from capa.features.file import Export, Import, Section
from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic
from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import FeatureExtractor
from capa.features.extractors.base_extractor import StaticFeatureExtractor
logger = logging.getLogger(__name__)
@@ -172,7 +172,7 @@ GLOBAL_HANDLERS = (
)
class PefileFeatureExtractor(FeatureExtractor):
class PefileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: str):
super().__init__()
self.path = path

View File

@@ -19,12 +19,12 @@ import capa.features.extractors.viv.function
import capa.features.extractors.viv.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
logger = logging.getLogger(__name__)
class VivisectFeatureExtractor(FeatureExtractor):
class VivisectFeatureExtractor(StaticFeatureExtractor):
def __init__(self, vw, path, os):
super().__init__()
self.vw = vw

View File

@@ -226,7 +226,7 @@ class Freeze(BaseModel):
allow_population_by_field_name = True
def dumps(extractor: capa.features.extractors.base_extractor.FeatureExtractor) -> str:
def dumps(extractor: capa.features.extractors.base_extractor.StaticFeatureExtractor) -> str:
"""
serialize the given extractor to a string
"""
@@ -327,7 +327,7 @@ def dumps(extractor: capa.features.extractors.base_extractor.FeatureExtractor) -
return freeze.json()
def loads(s: str) -> capa.features.extractors.base_extractor.FeatureExtractor:
def loads(s: str) -> capa.features.extractors.base_extractor.StaticFeatureExtractor:
"""deserialize a set of features (as a NullFeatureExtractor) from a string."""
import capa.features.extractors.null as null
@@ -363,7 +363,7 @@ def loads(s: str) -> capa.features.extractors.base_extractor.FeatureExtractor:
MAGIC = "capa0000".encode("ascii")
def dump(extractor: capa.features.extractors.base_extractor.FeatureExtractor) -> bytes:
def dump(extractor: capa.features.extractors.base_extractor.StaticFeatureExtractor) -> bytes:
"""serialize the given extractor to a byte array."""
return MAGIC + zlib.compress(dumps(extractor).encode("utf-8"))
@@ -372,7 +372,7 @@ def is_freeze(buf: bytes) -> bool:
return buf[: len(MAGIC)] == MAGIC
def load(buf: bytes) -> capa.features.extractors.base_extractor.FeatureExtractor:
def load(buf: bytes) -> capa.features.extractors.base_extractor.StaticFeatureExtractor:
"""deserialize a set of features (as a NullFeatureExtractor) from a byte array."""
if not is_freeze(buf):
raise ValueError("missing magic header")

View File

@@ -76,7 +76,14 @@ from capa.features.common import (
FORMAT_RESULT,
)
from capa.features.address import NO_ADDRESS, Address
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
from capa.features.extractors.base_extractor import (
BBHandle,
InsnHandle,
FunctionHandle,
FeatureExtractor,
StaticFeatureExtractor,
DynamicFeatureExtractor,
)
RULES_PATH_DEFAULT_STRING = "(embedded rules)"
SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)"
@@ -117,7 +124,7 @@ def set_vivisect_log_level(level):
def find_instruction_capabilities(
ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle, bb: BBHandle, insn: InsnHandle
ruleset: RuleSet, extractor: StaticFeatureExtractor, f: FunctionHandle, bb: BBHandle, insn: InsnHandle
) -> Tuple[FeatureSet, MatchResults]:
"""
find matches for the given rules for the given instruction.
@@ -144,7 +151,7 @@ def find_instruction_capabilities(
def find_basic_block_capabilities(
ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle, bb: BBHandle
ruleset: RuleSet, extractor: StaticFeatureExtractor, f: FunctionHandle, bb: BBHandle
) -> Tuple[FeatureSet, MatchResults, MatchResults]:
"""
find matches for the given rules within the given basic block.
@@ -184,7 +191,7 @@ def find_basic_block_capabilities(
def find_code_capabilities(
ruleset: RuleSet, extractor: FeatureExtractor, fh: FunctionHandle
ruleset: RuleSet, extractor: StaticFeatureExtractor, fh: FunctionHandle
) -> Tuple[MatchResults, MatchResults, MatchResults, int]:
"""
find matches for the given rules within the given function.
@@ -242,7 +249,9 @@ def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, functi
return matches, len(file_features)
def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None) -> Tuple[MatchResults, Any]:
def find_capabilities(
ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None
) -> Tuple[MatchResults, Any]:
all_function_matches = collections.defaultdict(list) # type: MatchResults
all_bb_matches = collections.defaultdict(list) # type: MatchResults
all_insn_matches = collections.defaultdict(list) # type: MatchResults
@@ -744,7 +753,7 @@ def collect_metadata(
format_: str,
os_: str,
rules_path: List[str],
extractor: capa.features.extractors.base_extractor.FeatureExtractor,
extractor: FeatureExtractor,
) -> rdoc.Metadata:
md5 = hashlib.md5()
sha1 = hashlib.sha1()