rules: add support for arch flavors of Number and Offset features

closes #210
This commit is contained in:
William Ballenthin
2020-08-03 16:28:47 -06:00
parent c982c2d04e
commit b81b5e5993
10 changed files with 244 additions and 87 deletions

View File

@@ -16,6 +16,12 @@ import capa.engine
logger = logging.getLogger(__name__)
MAX_BYTES_FEATURE_SIZE = 0x100
# identifiers for the supported architecture names that can qualify a feature,
# for example, offset/x32
ARCH_X32 = "x32"
ARCH_X64 = "x64"
VALID_ARCH = (ARCH_X32, ARCH_X64)
def bytes_to_str(b):
if sys.version_info[0] >= 3:
@@ -30,21 +36,42 @@ def hex_string(h):
class Feature(object):
def __init__(self, value, arch=None, description=None):
    """
    Args:
      value (any): the value of the feature, such as the number or string.
      arch (str): one of the VALID_ARCH values, or None.
        When None, then the feature applies to any architecture.
        Modifies the feature name from `feature` to `feature/arch`, like `offset/x32`.
      description (str): a human-readable description that explains the feature value.

    Raises:
      ValueError: when `arch` is neither None nor one of the VALID_ARCH values.
    """
    super(Feature, self).__init__()

    # the base name is the lowercased class name, like `offset`.
    name = self.__class__.__name__.lower()

    if arch is not None:
        # reject unknown arch identifiers up front;
        # callers that parse rules translate this ValueError into an invalid-rule error.
        if arch not in VALID_ARCH:
            raise ValueError("arch '%s' must be one of %s" % (arch, VALID_ARCH))
        # arch-specific flavors carry the arch as a name suffix, like `offset/x32`.
        name = name + "/" + arch

    self.name = name
    self.value = value
    self.arch = arch
    self.description = description
def __hash__(self):
return hash((self.name, self.value))
return hash((self.name, self.value, self.arch))
def __eq__(self, other):
return self.name == other.name and self.value == other.value
return self.name == other.name and self.value == other.value and self.arch == other.arch
# Used to overwrite the rendering of the feature value in `__str__` and the
# json output
def get_value_str(self):
"""
render the value of this feature, for use by `__str__` and friends.
subclasses should override to customize the rendering.
Returns: any
"""
return self.value
def __str__(self):
@@ -62,36 +89,44 @@ class Feature(object):
def evaluate(self, ctx):
return capa.engine.Result(self in ctx, self, [], locations=ctx.get(self, []))
def serialize(self):
return self.__dict__
def freeze_serialize(self):
return (self.__class__.__name__, [self.value])
if self.arch is not None:
return (self.__class__.__name__, [self.value, {"arch": self.arch}])
else:
return (self.__class__.__name__, [self.value])
@classmethod
def freeze_deserialize(cls, args):
    """
    rebuild a feature from the argument list emitted by `freeze_serialize`.

    Args:
      args (list): the constructor arguments.
        when there is more than one element and the last one is a dictionary,
        that dictionary is passed to the constructor as keyword arguments
        (for example, `[0x10, {"arch": "x32"}]`).
        otherwise, every element is passed positionally.

    Returns:
      an instance of `cls`.
    """
    if len(args) > 1 and isinstance(args[-1], dict):
        positional = args[:-1]
        kwargs = args[-1]
        return cls(*positional, **kwargs)

    # single-argument lists, and longer lists without a trailing dict,
    # are plain positional arguments; never fall through and return None.
    return cls(*args)
class MatchedRule(Feature):
def __init__(self, value, description=None):
super(MatchedRule, self).__init__(value, description)
super(MatchedRule, self).__init__(value, description=description)
self.name = "match"
class Characteristic(Feature):
def __init__(self, value, description=None):
super(Characteristic, self).__init__(value, description)
super(Characteristic, self).__init__(value, description=description)
class String(Feature):
def __init__(self, value, description=None):
super(String, self).__init__(value, description)
super(String, self).__init__(value, description=description)
class Regex(String):
def __init__(self, value, description=None):
super(Regex, self).__init__(value, description)
super(Regex, self).__init__(value, description=description)
pat = self.value[len("/") : -len("/")]
flags = re.DOTALL
if value.endswith("/i"):
@@ -129,13 +164,13 @@ class Regex(String):
class StringFactory(object):
def __new__(self, value, description):
if value.startswith("/") and (value.endswith("/") or value.endswith("/i")):
return Regex(value, description)
return String(value, description)
return Regex(value, description=description)
return String(value, description=description)
class Bytes(Feature):
def __init__(self, value, description=None):
super(Bytes, self).__init__(value, description)
super(Bytes, self).__init__(value, description=description)
def evaluate(self, ctx):
for feature, locations in ctx.items():

View File

@@ -12,12 +12,23 @@ import idautils
import capa.features.extractors.helpers
import capa.features.extractors.ida.helpers
from capa.features import MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic
from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic
from capa.features.insn import Number, Offset, Mnemonic
_file_imports_cache = None
def get_arch():
    """
    map the bitness reported by `idaapi.get_inf_structure()` to an arch identifier.

    Returns:
      str: ARCH_X64 or ARCH_X32.

    Raises:
      ValueError: when the info structure reports neither 64-bit nor 32-bit.
    """
    # https://reverseengineering.stackexchange.com/a/11398/17194
    inf = idaapi.get_inf_structure()

    # the 64-bit test is evaluated before the 32-bit test.
    if inf.is_64bit():
        return ARCH_X64

    if inf.is_32bit():
        return ARCH_X32

    raise ValueError("unexpected architecture")
def get_imports():
""" """
global _file_imports_cache
@@ -88,6 +99,7 @@ def extract_insn_number_features(f, bb, insn):
const = capa.features.extractors.ida.helpers.mask_op_val(op)
if not idaapi.is_mapped(const):
yield Number(const), insn.ea
yield Number(const, arch=get_arch()), insn.ea
def extract_insn_bytes_features(f, bb, insn):
@@ -155,6 +167,7 @@ def extract_insn_offset_features(f, bb, insn):
op_off = capa.features.extractors.helpers.twos_complement(op_off, 32)
yield Offset(op_off), insn.ea
yield Offset(op_off, arch=get_arch()), insn.ea
def contains_stack_cookie_keywords(s):

View File

@@ -11,7 +11,7 @@ import vivisect.const
import envi.archs.i386.disasm
import capa.features.extractors.helpers
from capa.features import MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic
from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic
from capa.features.insn import Number, Offset, Mnemonic
from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_indirect_call
@@ -20,6 +20,14 @@ from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_i
SECURITY_COOKIE_BYTES_DELTA = 0x40
def get_arch(vw):
    """
    map the workspace's "Architecture" metadata to an arch identifier.

    Args:
      vw: the workspace; only its `getMeta` method is used here.

    Returns:
      str or None: ARCH_X32 for "i386", ARCH_X64 for "amd64",
        and None for any other architecture value.
    """
    meta_arch = vw.getMeta("Architecture")

    if meta_arch == "i386":
        return ARCH_X32

    if meta_arch == "amd64":
        return ARCH_X64

    # unrecognized architecture: there is no arch flavor to report.
    return None
def interface_extract_instruction_XXX(f, bb, insn):
"""
parse features from the given instruction.
@@ -138,6 +146,7 @@ def extract_insn_number_features(f, bb, insn):
return
yield Number(v), insn.va
yield Number(v, arch=get_arch(f.vw)), insn.va
def extract_insn_bytes_features(f, bb, insn):
@@ -266,8 +275,10 @@ def extract_insn_offset_features(f, bb, insn):
continue
# viv already decodes offsets as signed
v = oper.disp
yield Offset(oper.disp), insn.va
yield Offset(v), insn.va
yield Offset(v, arch=get_arch(f.vw)), insn.va
def is_security_cookie(f, bb, insn):

View File

@@ -12,16 +12,16 @@ from capa.features import Feature
class Export(Feature):
def __init__(self, value, description=None):
# value is export name
super(Export, self).__init__(value, description)
super(Export, self).__init__(value, description=description)
class Import(Feature):
def __init__(self, value, description=None):
# value is import name
super(Import, self).__init__(value, description)
super(Import, self).__init__(value, description=description)
class Section(Feature):
def __init__(self, value, description=None):
# value is section name
super(Section, self).__init__(value, description)
super(Section, self).__init__(value, description=description)

View File

@@ -20,16 +20,16 @@ class API(Feature):
class Number(Feature):
def __init__(self, value, description=None):
super(Number, self).__init__(value, description)
def __init__(self, value, arch=None, description=None):
super(Number, self).__init__(value, arch=arch, description=description)
def get_value_str(self):
return "0x%X" % self.value
class Offset(Feature):
def __init__(self, value, description=None):
super(Offset, self).__init__(value, description)
def __init__(self, value, arch=None, description=None):
super(Offset, self).__init__(value, arch=arch, description=description)
def get_value_str(self):
return "0x%X" % self.value
@@ -37,4 +37,4 @@ class Offset(Feature):
class Mnemonic(Feature):
def __init__(self, value, description=None):
super(Mnemonic, self).__init__(value, description)
super(Mnemonic, self).__init__(value, description=description)

View File

@@ -195,8 +195,14 @@ def parse_feature(key):
return capa.features.Bytes
elif key == "number":
return capa.features.insn.Number
elif key.startswith("number/"):
arch = key.partition("/")[2]
return lambda *args, **kwargs: capa.features.insn.Number(*args, arch=arch, **kwargs)
elif key == "offset":
return capa.features.insn.Offset
elif key.startswith("offset/"):
arch = key.partition("/")[2]
return lambda *args, **kwargs: capa.features.insn.Offset(*args, arch=arch, **kwargs)
elif key == "mnemonic":
return capa.features.insn.Mnemonic
elif key == "basic blocks":
@@ -325,7 +331,7 @@ def build_statements(d, scope):
# count(number(0x100 = description))
if term != "string":
value, description = parse_description(arg, term)
feature = Feature(value, description)
feature = Feature(value, description=description)
else:
# arg is string (which doesn't support inline descriptions), like:
#
@@ -358,7 +364,7 @@ def build_statements(d, scope):
Feature = parse_feature(key)
value, description = parse_description(d[key], key, d.get("description"))
try:
feature = Feature(value, description)
feature = Feature(value, description=description)
except ValueError as e:
raise InvalidRule(str(e))
ensure_feature_valid_for_scope(scope, feature)

View File

@@ -444,3 +444,15 @@ def test_match_namespace():
assert "WriteFile API" in matches
assert "file-create" not in matches
assert "filesystem-any" in matches
def test_render_number():
    """the rendered name of a Number feature carries its arch flavor, when there is one."""
    # the arch constants are referenced through `capa.features`, the module that defines them,
    # so this test does not rely on them having been imported into this file by name.
    assert str(capa.features.insn.Number(1)) == "number(0x1)"
    assert str(capa.features.insn.Number(1, arch=capa.features.ARCH_X32)) == "number/x32(0x1)"
    assert str(capa.features.insn.Number(1, arch=capa.features.ARCH_X64)) == "number/x64(0x1)"
def test_render_offset():
    """the rendered name of an Offset feature carries its arch flavor, when there is one."""
    # the arch constants are referenced through `capa.features`, the module that defines them,
    # so this test does not rely on them having been imported into this file by name.
    assert str(capa.features.insn.Offset(1)) == "offset(0x1)"
    assert str(capa.features.insn.Offset(1, arch=capa.features.ARCH_X32)) == "offset/x32(0x1)"
    assert str(capa.features.insn.Offset(1, arch=capa.features.ARCH_X64)) == "offset/x64(0x1)"

View File

@@ -1,5 +1,6 @@
# run this script from within IDA with ./tests/data/mimikatz.exe open
import logging
import binascii
import traceback
import collections
@@ -9,6 +10,7 @@ import capa.features
import capa.features.file
import capa.features.insn
import capa.features.basicblock
from capa.features import ARCH_X32, ARCH_X64
logger = logging.getLogger("test_ida_features")
@@ -17,9 +19,14 @@ def check_input_file():
import idautils
wanted = "5f66b82558ca92e54e77f216ef4c066c"
# some versions of IDA return a truncated version of the MD5.
# some versions (7.4) of IDA return a truncated version of the MD5.
# https://github.com/idapython/bin/issues/11
found = idautils.GetInputFileMD5().rstrip(b"\x00").decode("ascii").lower()
try:
found = idautils.GetInputFileMD5()[:31].decode("ascii").lower()
except UnicodeDecodeError:
# in IDA 7.5 or so, GetInputFileMD5 started returning raw binary
# rather than the hex digest
found = binascii.hexlify(idautils.GetInputFileMD5()[:15]).decode("ascii").lower()
if not wanted.startswith(found):
raise RuntimeError("please run the tests against `mimikatz.exe`")
@@ -122,6 +129,17 @@ def test_number_features():
assert capa.features.insn.Number(0x10) not in features
@pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
def test_number_arch_features():
    """number features are extracted both without an arch and with the x32 flavor."""
    import idaapi

    features = extract_function_features(idaapi.get_func(0x40105D))

    # the arch-less flavor is present...
    assert capa.features.insn.Number(0xFF) in features
    # ...alongside the x32 flavor, and without the x64 one.
    assert capa.features.insn.Number(0xFF, arch=ARCH_X32) in features
    assert capa.features.insn.Number(0xFF, arch=ARCH_X64) not in features
@pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
def test_offset_features():
import idaapi
@@ -144,6 +162,17 @@ def test_offset_features():
assert capa.features.insn.Offset(-0x2) in features
@pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
def test_offset_arch_features(mimikatz=None):
    """
    offset features are extracted both without an arch and with the x32 flavor.

    Args:
      mimikatz: unused. it defaults to None so that this test can be called without
        arguments, like the other IDA tests in this file, while still accepting
        a positional argument from any existing caller.
    """
    import idaapi

    f = idaapi.get_func(0x40105D)
    features = extract_function_features(f)

    # the arch-less flavor is present...
    assert capa.features.insn.Offset(0x0) in features
    # ...alongside the x32 flavor, and without the x64 one.
    assert capa.features.insn.Offset(0x0, arch=ARCH_X32) in features
    assert capa.features.insn.Offset(0x0, arch=ARCH_X64) not in features
@pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
def test_nzxor_features():
import idaapi

View File

@@ -11,7 +11,7 @@ import textwrap
import pytest
import capa.rules
from capa.features import String
from capa.features import ARCH_X32, ARCH_X64, String
from capa.features.insn import Number, Offset
@@ -380,10 +380,10 @@ def test_number_symbol():
children = list(r.statement.get_children())
assert (Number(1) in children) == True
assert (Number(0xFFFFFFFF) in children) == True
assert (Number(2, "symbol name") in children) == True
assert (Number(3, "symbol name") in children) == True
assert (Number(4, "symbol name = another name") in children) == True
assert (Number(0x100, "symbol name") in children) == True
assert (Number(2, description="symbol name") in children) == True
assert (Number(3, description="symbol name") in children) == True
assert (Number(4, description="symbol name = another name") in children) == True
assert (Number(0x100, description="symbol name") in children) == True
def test_count_number_symbol():
@@ -403,8 +403,8 @@ def test_count_number_symbol():
assert r.evaluate({Number(2): {}}) == False
assert r.evaluate({Number(2): {1}}) == True
assert r.evaluate({Number(2): {1, 2}}) == False
assert r.evaluate({Number(0x100, "symbol name"): {1}}) == False
assert r.evaluate({Number(0x100, "symbol name"): {1, 2, 3}}) == True
assert r.evaluate({Number(0x100, description="symbol name"): {1}}) == False
assert r.evaluate({Number(0x100, description="symbol name"): {1, 2, 3}}) == True
def test_invalid_number():
@@ -448,6 +448,24 @@ def test_invalid_number():
)
def test_number_arch():
# a rule written with the `number/x32` key should match only the x32 flavor of Number.
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- number/x32: 2
"""
)
)
# the feature with the same arch as the rule matches...
assert r.evaluate({Number(2, arch=ARCH_X32): {1}}) == True
# ...but neither the arch-less feature nor the x64 flavor does.
assert r.evaluate({Number(2): {1}}) == False
assert r.evaluate({Number(2, arch=ARCH_X64): {1}}) == False
def test_offset_symbol():
rule = textwrap.dedent(
"""
@@ -466,10 +484,10 @@ def test_offset_symbol():
r = capa.rules.Rule.from_yaml(rule)
children = list(r.statement.get_children())
assert (Offset(1) in children) == True
assert (Offset(2, "symbol name") in children) == True
assert (Offset(3, "symbol name") in children) == True
assert (Offset(4, "symbol name = another name") in children) == True
assert (Offset(0x100, "symbol name") in children) == True
assert (Offset(2, description="symbol name") in children) == True
assert (Offset(3, description="symbol name") in children) == True
assert (Offset(4, description="symbol name = another name") in children) == True
assert (Offset(0x100, description="symbol name") in children) == True
def test_count_offset_symbol():
@@ -489,8 +507,67 @@ def test_count_offset_symbol():
assert r.evaluate({Offset(2): {}}) == False
assert r.evaluate({Offset(2): {1}}) == True
assert r.evaluate({Offset(2): {1, 2}}) == False
assert r.evaluate({Offset(0x100, "symbol name"): {1}}) == False
assert r.evaluate({Offset(0x100, "symbol name"): {1, 2, 3}}) == True
assert r.evaluate({Offset(0x100, description="symbol name"): {1}}) == False
assert r.evaluate({Offset(0x100, description="symbol name"): {1, 2, 3}}) == True
def test_offset_arch():
# a rule written with the `offset/x32` key should match only the x32 flavor of Offset.
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- offset/x32: 2
"""
)
)
# the feature with the same arch as the rule matches...
assert r.evaluate({Offset(2, arch=ARCH_X32): {1}}) == True
# ...but neither the arch-less feature nor the x64 flavor does.
assert r.evaluate({Offset(2): {1}}) == False
assert r.evaluate({Offset(2, arch=ARCH_X64): {1}}) == False
def test_invalid_offset():
# each of the rules below carries a malformed `offset` value and must be rejected with InvalidRule.

# case 1: the offset value is a quoted string rather than a number.
with pytest.raises(capa.rules.InvalidRule):
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- offset: "this is a string"
"""
)
)
# case 2: the value ends with `=` but nothing follows it.
with pytest.raises(capa.rules.InvalidRule):
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- offset: 2=
"""
)
)
# case 3: text comes before the `=` and the number after it.
with pytest.raises(capa.rules.InvalidRule):
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- offset: symbol name = 2
"""
)
)
def test_invalid_string_values_int():
@@ -566,47 +643,6 @@ def test_regex_values_always_string():
assert capa.features.MatchedRule("test rule") in features
def test_invalid_offset():
with pytest.raises(capa.rules.InvalidRule):
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- offset: "this is a string"
"""
)
)
with pytest.raises(capa.rules.InvalidRule):
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- offset: 2=
"""
)
)
with pytest.raises(capa.rules.InvalidRule):
r = capa.rules.Rule.from_yaml(
textwrap.dedent(
"""
rule:
meta:
name: test rule
features:
- offset: symbol name = 2
"""
)
)
def test_filter_rules():
rules = capa.rules.RuleSet(
[

View File

@@ -17,6 +17,7 @@ import capa.features.extractors.viv.file
import capa.features.extractors.viv.insn
import capa.features.extractors.viv.function
import capa.features.extractors.viv.basicblock
from capa.features import ARCH_X32, ARCH_X64
def extract_file_features(vw, path):
@@ -108,6 +109,13 @@ def test_number_features(mimikatz):
assert capa.features.insn.Number(0x10) not in features
def test_number_arch_features(mimikatz):
    """number features are extracted both without an arch and with the x32 flavor."""
    func = viv_utils.Function(mimikatz.vw, 0x40105D)
    features = extract_function_features(func)

    # the arch-less flavor is present...
    assert capa.features.insn.Number(0xFF) in features
    # ...alongside the x32 flavor, and without the x64 one.
    assert capa.features.insn.Number(0xFF, arch=ARCH_X32) in features
    assert capa.features.insn.Number(0xFF, arch=ARCH_X64) not in features
def test_offset_features(mimikatz):
features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x40105D))
assert capa.features.insn.Offset(0x0) in features
@@ -125,6 +133,13 @@ def test_offset_features(mimikatz):
assert capa.features.insn.Offset(-0x2) in features
def test_offset_arch_features(mimikatz):
    """offset features are extracted both without an arch and with the x32 flavor."""
    func = viv_utils.Function(mimikatz.vw, 0x40105D)
    features = extract_function_features(func)

    # the arch-less flavor is present...
    assert capa.features.insn.Offset(0x0) in features
    # ...alongside the x32 flavor, and without the x64 one.
    assert capa.features.insn.Offset(0x0, arch=ARCH_X32) in features
    assert capa.features.insn.Offset(0x0, arch=ARCH_X64) not in features
def test_nzxor_features(mimikatz):
features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x410DFC))
assert capa.features.Characteristic("nzxor") in features # 0x0410F0B