diff --git a/capa/features/__init__.py b/capa/features/__init__.py index 956d5f17..2f14a75e 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -16,6 +16,12 @@ import capa.engine logger = logging.getLogger(__name__) MAX_BYTES_FEATURE_SIZE = 0x100 +# identifiers for supported architectures names that tweak a feature +# for example, offset/x32 +ARCH_X32 = "x32" +ARCH_X64 = "x64" +VALID_ARCH = (ARCH_X32, ARCH_X64) + def bytes_to_str(b): if sys.version_info[0] >= 3: @@ -30,21 +36,42 @@ def hex_string(h): class Feature(object): - def __init__(self, value, description=None): + def __init__(self, value, arch=None, description=None): + """ + Args: + value (any): the value of the feature, such as the number or string. + arch (str): one of the VALID_ARCH values, or None. + When None, then the feature applies to any architecture. + Modifies the feature name from `feature` to `feature/arch`, like `offset/x32`. + description (str): a human-readable description that explains the feature value. + """ super(Feature, self).__init__() - self.name = self.__class__.__name__.lower() + + if arch is not None: + if arch not in VALID_ARCH: + print(value, arch, description) + raise ValueError("arch '%s' must be one of %s" % (arch, VALID_ARCH)) + self.name = self.__class__.__name__.lower() + "/" + arch + else: + self.name = self.__class__.__name__.lower() + self.value = value + self.arch = arch self.description = description def __hash__(self): - return hash((self.name, self.value)) + return hash((self.name, self.value, self.arch)) def __eq__(self, other): - return self.name == other.name and self.value == other.value + return self.name == other.name and self.value == other.value and self.arch == other.arch - # Used to overwrite the rendering of the feature value in `__str__` and the - # json output def get_value_str(self): + """ + render the value of this feature, for use by `__str__` and friends. + subclasses should override to customize the rendering. + + Returns: any + """ return self.value def __str__(self): @@ -62,36 +89,44 @@ class Feature(object): def evaluate(self, ctx): return capa.engine.Result(self in ctx, self, [], locations=ctx.get(self, [])) - def serialize(self): - return self.__dict__ - def freeze_serialize(self): - return (self.__class__.__name__, [self.value]) + if self.arch is not None: + return (self.__class__.__name__, [self.value, {"arch": self.arch}]) + else: + return (self.__class__.__name__, [self.value]) @classmethod def freeze_deserialize(cls, args): - return cls(*args) + # as you can see below in code, + # if the last argument is a dictionary, + # consider it to be kwargs passed to the feature constructor. + if len(args) == 1: + return cls(*args) + elif isinstance(args[-1], dict): + kwargs = args[-1] + args = args[:-1] + return cls(*args, **kwargs) class MatchedRule(Feature): def __init__(self, value, description=None): - super(MatchedRule, self).__init__(value, description) + super(MatchedRule, self).__init__(value, description=description) self.name = "match" class Characteristic(Feature): def __init__(self, value, description=None): - super(Characteristic, self).__init__(value, description) + super(Characteristic, self).__init__(value, description=description) class String(Feature): def __init__(self, value, description=None): - super(String, self).__init__(value, description) + super(String, self).__init__(value, description=description) class Regex(String): def __init__(self, value, description=None): - super(Regex, self).__init__(value, description) + super(Regex, self).__init__(value, description=description) pat = self.value[len("/") : -len("/")] flags = re.DOTALL if value.endswith("/i"): @@ -129,13 +164,13 @@ class Regex(String): class StringFactory(object): def __new__(self, value, description): if value.startswith("/") and (value.endswith("/") or value.endswith("/i")): - return Regex(value, description) - return String(value, description) + return Regex(value, description=description) + return String(value, description=description) class Bytes(Feature): def __init__(self, value, description=None): - super(Bytes, self).__init__(value, description) + super(Bytes, self).__init__(value, description=description) def evaluate(self, ctx): for feature, locations in ctx.items(): diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index 6440164b..e72970c7 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -12,12 +12,23 @@ import idautils import capa.features.extractors.helpers import capa.features.extractors.ida.helpers -from capa.features import MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic +from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic from capa.features.insn import Number, Offset, Mnemonic _file_imports_cache = None +def get_arch(): + # https://reverseengineering.stackexchange.com/a/11398/17194 + info = idaapi.get_inf_structure() + if info.is_64bit(): + return ARCH_X64 + elif info.is_32bit(): + return ARCH_X32 + else: + raise ValueError("unexpected architecture") + + def get_imports(): """ """ global _file_imports_cache @@ -88,6 +99,7 @@ def extract_insn_number_features(f, bb, insn): const = capa.features.extractors.ida.helpers.mask_op_val(op) if not idaapi.is_mapped(const): yield Number(const), insn.ea + yield Number(const, arch=get_arch()), insn.ea def extract_insn_bytes_features(f, bb, insn): @@ -155,6 +167,7 @@ def extract_insn_offset_features(f, bb, insn): op_off = capa.features.extractors.helpers.twos_complement(op_off, 32) yield Offset(op_off), insn.ea + yield Offset(op_off, arch=get_arch()), insn.ea def contains_stack_cookie_keywords(s): diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index ef545ffd..2629e0dc 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -11,7 +11,7 @@ import vivisect.const import envi.archs.i386.disasm import capa.features.extractors.helpers -from capa.features import MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic +from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic from capa.features.insn import Number, Offset, Mnemonic from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_indirect_call @@ -20,6 +20,14 @@ from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_i SECURITY_COOKIE_BYTES_DELTA = 0x40 +def get_arch(vw): + arch = vw.getMeta("Architecture") + if arch == "i386": + return ARCH_X32 + elif arch == "amd64": + return ARCH_X64 + + def interface_extract_instruction_XXX(f, bb, insn): """ parse features from the given instruction. @@ -138,6 +146,7 @@ def extract_insn_number_features(f, bb, insn): return yield Number(v), insn.va + yield Number(v, arch=get_arch(f.vw)), insn.va def extract_insn_bytes_features(f, bb, insn): @@ -266,8 +275,10 @@ def extract_insn_offset_features(f, bb, insn): continue # viv already decodes offsets as signed + v = oper.disp - yield Offset(oper.disp), insn.va + yield Offset(v), insn.va + yield Offset(v, arch=get_arch(f.vw)), insn.va def is_security_cookie(f, bb, insn): diff --git a/capa/features/file.py b/capa/features/file.py index b5bea29c..f4629a55 100644 --- a/capa/features/file.py +++ b/capa/features/file.py @@ -12,16 +12,16 @@ from capa.features import Feature class Export(Feature): def __init__(self, value, description=None): # value is export name - super(Export, self).__init__(value, description) + super(Export, self).__init__(value, description=description) class Import(Feature): def __init__(self, value, description=None): # value is import name - super(Import, self).__init__(value, description) + super(Import, self).__init__(value, description=description) class Section(Feature): def __init__(self, value, description=None): # value is section name - super(Section, self).__init__(value, description) + super(Section, self).__init__(value, description=description) diff --git a/capa/features/insn.py b/capa/features/insn.py index 5b33aeb6..bd8b4c94 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -20,16 +20,16 @@ class API(Feature): class Number(Feature): - def __init__(self, value, description=None): - super(Number, self).__init__(value, description) + def __init__(self, value, arch=None, description=None): + super(Number, self).__init__(value, arch=arch, description=description) def get_value_str(self): return "0x%X" % self.value class Offset(Feature): - def __init__(self, value, description=None): - super(Offset, self).__init__(value, description) + def __init__(self, value, arch=None, description=None): + super(Offset, self).__init__(value, arch=arch, description=description) def get_value_str(self): return "0x%X" % self.value @@ -37,4 +37,4 @@ class Offset(Feature): class Mnemonic(Feature): def __init__(self, value, description=None): - super(Mnemonic, self).__init__(value, description) + super(Mnemonic, self).__init__(value, description=description) diff --git a/capa/rules.py b/capa/rules.py index 92cd0c56..88520527 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -195,8 +195,14 @@ def parse_feature(key): return capa.features.Bytes elif key == "number": return capa.features.insn.Number + elif key.startswith("number/"): + arch = key.partition("/")[2] + return lambda *args, **kwargs: capa.features.insn.Number(*args, arch=arch, **kwargs) elif key == "offset": return capa.features.insn.Offset + elif key.startswith("offset/"): + arch = key.partition("/")[2] + return lambda *args, **kwargs: capa.features.insn.Offset(*args, arch=arch, **kwargs) elif key == "mnemonic": return capa.features.insn.Mnemonic elif key == "basic blocks": @@ -325,7 +331,7 @@ def build_statements(d, scope): # count(number(0x100 = description)) if term != "string": value, description = parse_description(arg, term) - feature = Feature(value, description) + feature = Feature(value, description=description) else: # arg is string (which doesn't support inline descriptions), like: # @@ -358,7 +364,7 @@ def build_statements(d, scope): Feature = parse_feature(key) value, description = parse_description(d[key], key, d.get("description")) try: - feature = Feature(value, description) + feature = Feature(value, description=description) except ValueError as e: raise InvalidRule(str(e)) ensure_feature_valid_for_scope(scope, feature) diff --git a/tests/test_engine.py b/tests/test_engine.py index c959283e..0f868149 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -444,3 +444,15 @@ def test_match_namespace(): assert "WriteFile API" in matches assert "file-create" not in matches assert "filesystem-any" in matches + + +def test_render_number(): + assert str(capa.features.insn.Number(1)) == "number(0x1)" + assert str(capa.features.insn.Number(1, arch=ARCH_X32)) == "number/x32(0x1)" + assert str(capa.features.insn.Number(1, arch=ARCH_X64)) == "number/x64(0x1)" + + +def test_render_offset(): + assert str(capa.features.insn.Offset(1)) == "offset(0x1)" + assert str(capa.features.insn.Offset(1, arch=ARCH_X32)) == "offset/x32(0x1)" + assert str(capa.features.insn.Offset(1, arch=ARCH_X64)) == "offset/x64(0x1)" diff --git a/tests/test_ida_features.py b/tests/test_ida_features.py index 562158f1..a5774588 100644 --- a/tests/test_ida_features.py +++ b/tests/test_ida_features.py @@ -1,5 +1,6 @@ # run this script from within IDA with ./tests/data/mimikatz.exe open import logging +import binascii import traceback import collections @@ -9,6 +10,7 @@ import capa.features import capa.features.file import capa.features.insn import capa.features.basicblock +from capa.features import ARCH_X32, ARCH_X64 logger = logging.getLogger("test_ida_features") @@ -17,9 +19,14 @@ def check_input_file(): import idautils wanted = "5f66b82558ca92e54e77f216ef4c066c" - # some versions of IDA return a truncated version of the MD5. + # some versions (7.4) of IDA return a truncated version of the MD5. # https://github.com/idapython/bin/issues/11 - found = idautils.GetInputFileMD5().rstrip(b"\x00").decode("ascii").lower() + try: + found = idautils.GetInputFileMD5()[:31].decode("ascii").lower() + except UnicodeDecodeError: + # in IDA 7.5 or so, GetInputFileMD5 started returning raw binary + # rather than the hex digest + found = binascii.hexlify(idautils.GetInputFileMD5()[:15]).decode("ascii").lower() if not wanted.startswith(found): raise RuntimeError("please run the tests against `mimikatz.exe`") @@ -122,6 +129,17 @@ def test_number_features(): assert capa.features.insn.Number(0x10) not in features +@pytest.mark.skip(reason="IDA Pro tests must be run within IDA") +def test_number_arch_features(): + import idaapi + + f = idaapi.get_func(0x40105D) + features = extract_function_features(f) + assert capa.features.insn.Number(0xFF) in features + assert capa.features.insn.Number(0xFF, arch=ARCH_X32) in features + assert capa.features.insn.Number(0xFF, arch=ARCH_X64) not in features + + @pytest.mark.skip(reason="IDA Pro tests must be run within IDA") def test_offset_features(): import idaapi @@ -144,6 +162,17 @@ def test_offset_features(): assert capa.features.insn.Offset(-0x2) in features +@pytest.mark.skip(reason="IDA Pro tests must be run within IDA") +def test_offset_arch_features(mimikatz): + import idaapi + + f = idaapi.get_func(0x40105D) + features = extract_function_features(f) + assert capa.features.insn.Offset(0x0) in features + assert capa.features.insn.Offset(0x0, arch=ARCH_X32) in features + assert capa.features.insn.Offset(0x0, arch=ARCH_X64) not in features + + @pytest.mark.skip(reason="IDA Pro tests must be run within IDA") def test_nzxor_features(): import idaapi diff --git a/tests/test_rules.py b/tests/test_rules.py index 9a33c0e8..6c70b7e1 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -11,7 +11,7 @@ import textwrap import pytest import capa.rules -from capa.features import String +from capa.features import ARCH_X32, ARCH_X64, String from capa.features.insn import Number, Offset @@ -380,10 +380,10 @@ def test_number_symbol(): children = list(r.statement.get_children()) assert (Number(1) in children) == True assert (Number(0xFFFFFFFF) in children) == True - assert (Number(2, "symbol name") in children) == True - assert (Number(3, "symbol name") in children) == True - assert (Number(4, "symbol name = another name") in children) == True - assert (Number(0x100, "symbol name") in children) == True + assert (Number(2, description="symbol name") in children) == True + assert (Number(3, description="symbol name") in children) == True + assert (Number(4, description="symbol name = another name") in children) == True + assert (Number(0x100, description="symbol name") in children) == True def test_count_number_symbol(): @@ -403,8 +403,8 @@ def test_count_number_symbol(): assert r.evaluate({Number(2): {}}) == False assert r.evaluate({Number(2): {1}}) == True assert r.evaluate({Number(2): {1, 2}}) == False - assert r.evaluate({Number(0x100, "symbol name"): {1}}) == False - assert r.evaluate({Number(0x100, "symbol name"): {1, 2, 3}}) == True + assert r.evaluate({Number(0x100, description="symbol name"): {1}}) == False + assert r.evaluate({Number(0x100, description="symbol name"): {1, 2, 3}}) == True def test_invalid_number(): @@ -448,6 +448,24 @@ def test_invalid_number(): ) +def test_number_arch(): + r = capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + features: + - number/x32: 2 + """ + ) + ) + assert r.evaluate({Number(2, arch=ARCH_X32): {1}}) == True + + assert r.evaluate({Number(2): {1}}) == False + assert r.evaluate({Number(2, arch=ARCH_X64): {1}}) == False + + def test_offset_symbol(): rule = textwrap.dedent( """ @@ -466,10 +484,10 @@ def test_offset_symbol(): r = capa.rules.Rule.from_yaml(rule) children = list(r.statement.get_children()) assert (Offset(1) in children) == True - assert (Offset(2, "symbol name") in children) == True - assert (Offset(3, "symbol name") in children) == True - assert (Offset(4, "symbol name = another name") in children) == True - assert (Offset(0x100, "symbol name") in children) == True + assert (Offset(2, description="symbol name") in children) == True + assert (Offset(3, description="symbol name") in children) == True + assert (Offset(4, description="symbol name = another name") in children) == True + assert (Offset(0x100, description="symbol name") in children) == True def test_count_offset_symbol(): @@ -489,8 +507,67 @@ def test_count_offset_symbol(): assert r.evaluate({Offset(2): {}}) == False assert r.evaluate({Offset(2): {1}}) == True assert r.evaluate({Offset(2): {1, 2}}) == False - assert r.evaluate({Offset(0x100, "symbol name"): {1}}) == False - assert r.evaluate({Offset(0x100, "symbol name"): {1, 2, 3}}) == True + assert r.evaluate({Offset(0x100, description="symbol name"): {1}}) == False + assert r.evaluate({Offset(0x100, description="symbol name"): {1, 2, 3}}) == True + + +def test_offset_arch(): + r = capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + features: + - offset/x32: 2 + """ + ) + ) + assert r.evaluate({Offset(2, arch=ARCH_X32): {1}}) == True + + assert r.evaluate({Offset(2): {1}}) == False + assert r.evaluate({Offset(2, arch=ARCH_X64): {1}}) == False + + +def test_invalid_offset(): + with pytest.raises(capa.rules.InvalidRule): + r = capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + features: + - offset: "this is a string" + """ + ) + ) + + with pytest.raises(capa.rules.InvalidRule): + r = capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + features: + - offset: 2= + """ + ) + ) + + with pytest.raises(capa.rules.InvalidRule): + r = capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + features: + - offset: symbol name = 2 + """ + ) + ) def test_invalid_string_values_int(): @@ -566,47 +643,6 @@ def test_regex_values_always_string(): assert capa.features.MatchedRule("test rule") in features -def test_invalid_offset(): - with pytest.raises(capa.rules.InvalidRule): - r = capa.rules.Rule.from_yaml( - textwrap.dedent( - """ - rule: - meta: - name: test rule - features: - - offset: "this is a string" - """ - ) - ) - - with pytest.raises(capa.rules.InvalidRule): - r = capa.rules.Rule.from_yaml( - textwrap.dedent( - """ - rule: - meta: - name: test rule - features: - - offset: 2= - """ - ) - ) - - with pytest.raises(capa.rules.InvalidRule): - r = capa.rules.Rule.from_yaml( - textwrap.dedent( - """ - rule: - meta: - name: test rule - features: - - offset: symbol name = 2 - """ - ) - ) - - def test_filter_rules(): rules = capa.rules.RuleSet( [ diff --git a/tests/test_viv_features.py b/tests/test_viv_features.py index dae5339b..67c38ee4 100644 --- a/tests/test_viv_features.py +++ b/tests/test_viv_features.py @@ -17,6 +17,7 @@ import capa.features.extractors.viv.file import capa.features.extractors.viv.insn import capa.features.extractors.viv.function import capa.features.extractors.viv.basicblock +from capa.features import ARCH_X32, ARCH_X64 def extract_file_features(vw, path): @@ -108,6 +109,13 @@ def test_number_features(mimikatz): assert capa.features.insn.Number(0x10) not in features +def test_number_arch_features(mimikatz): + features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x40105D)) + assert capa.features.insn.Number(0xFF) in features + assert capa.features.insn.Number(0xFF, arch=ARCH_X32) in features + assert capa.features.insn.Number(0xFF, arch=ARCH_X64) not in features + + def test_offset_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x40105D)) assert capa.features.insn.Offset(0x0) in features @@ -125,6 +133,13 @@ def test_offset_features(mimikatz): assert capa.features.insn.Offset(-0x2) in features +def test_offset_arch_features(mimikatz): + features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x40105D)) + assert capa.features.insn.Offset(0x0) in features + assert capa.features.insn.Offset(0x0, arch=ARCH_X32) in features + assert capa.features.insn.Offset(0x0, arch=ARCH_X64) not in features + + def test_nzxor_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x410DFC)) assert capa.features.Characteristic("nzxor") in features # 0x0410F0B