diff --git a/CHANGELOG.md b/CHANGELOG.md index 466b05f0..cf5e5121 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ### Breaking Changes - instruction scope and operand feature are new and are not backwards compatible with older versions of capa + - Python 3.7 is now the minimum supported Python version #866 @williballenthin - remove /x32 and /x64 flavors of number and operand features #932 @williballenthin - the tool now accepts multiple paths to rules, and JSON doc updated accordingly @williballenthin diff --git a/capa/features/address.py b/capa/features/address.py index 5d96b75c..051f6694 100644 --- a/capa/features/address.py +++ b/capa/features/address.py @@ -22,6 +22,7 @@ class Address(abc.ABC): class AbsoluteVirtualAddress(int, Address): """an absolute memory address""" + def __new__(cls, v): assert v > 0 return int.__new__(cls, v) @@ -35,6 +36,7 @@ class RelativeVirtualAddress(int, Address): class FileOffsetAddress(int, Address): """an address relative to the start of a file""" + def __new__(cls, v): assert v > 0 return int.__new__(cls, v) @@ -42,6 +44,7 @@ class FileOffsetAddress(int, Address): class DNTokenAddress(Token, Address): """a .NET token""" + pass diff --git a/capa/features/extractors/ida/helpers.py b/capa/features/extractors/ida/helpers.py index a43283aa..2e2ab87c 100644 --- a/capa/features/extractors/ida/helpers.py +++ b/capa/features/extractors/ida/helpers.py @@ -382,3 +382,8 @@ def get_function_blocks(f): def is_basic_block_return(bb): """check if basic block is return block""" return bb.type == idaapi.fcb_ret + + +def has_sib(oper) -> bool: + # via: https://reverseengineering.stackexchange.com/a/14300 + return oper.specflag1 == 1 diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index 15942c78..97ce8f18 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -135,6 +135,15 @@ def extract_insn_number_features(f, bb, insn): yield Number(const), insn.ea 
yield OperandNumber(i, const), insn.ea + if insn.itype == idaapi.NN_add and 0 < const < MAX_STRUCTURE_SIZE and op.type == idaapi.o_imm: + # for pattern like: + # + # add eax, 0x10 + # + # assume 0x10 is also an offset (imagine eax is a pointer). + yield Offset(const), insn.ea + yield OperandOffset(i, const), insn.ea + def extract_insn_bytes_features(f, bb, insn): """parse referenced byte sequences @@ -209,6 +218,25 @@ def extract_insn_offset_features(f, bb, insn): yield Offset(op_off), insn.ea yield OperandOffset(i, op_off), insn.ea + if ( + insn.itype == idaapi.NN_lea + and i == 1 + # o_displ is used for both: + # [eax+1] + # [eax+ebx+2] + and op.type == idaapi.o_displ + # but the SIB is only present for [eax+ebx+2] + # which we don't want + and not capa.features.extractors.ida.helpers.has_sib(op) + ): + # for pattern like: + # + # lea eax, [ebx + 1] + # + # assume 1 is also a number (imagine ebx is a zero register). + yield Number(op_off), insn.ea + yield OperandNumber(i, op_off), insn.ea + def contains_stack_cookie_keywords(s): """check if string contains stack cookie keywords diff --git a/capa/features/extractors/smda/insn.py b/capa/features/extractors/smda/insn.py index 309389e8..1635d84a 100644 --- a/capa/features/extractors/smda/insn.py +++ b/capa/features/extractors/smda/insn.py @@ -5,7 +5,7 @@ import struct from smda.common.SmdaReport import SmdaReport import capa.features.extractors.helpers -from capa.features.insn import API, Number, Offset, Mnemonic +from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Characteristic # security cookie checks may perform non-zeroing XORs, these are expected within a certain @@ -64,15 +64,25 @@ def extract_insn_number_features(f, bb, insn): # .text:00401140 call sub_407E2B # .text:00401145 add esp, 0Ch return - for operand in operands: + for i, operand in 
enumerate(operands): try: # The result of bitwise operations is calculated as though carried out # in two’s complement with an infinite number of sign bits value = int(operand, 16) & ((1 << f.smda_report.bitness) - 1) - - yield Number(value), insn.offset - except: + except ValueError: continue + else: + yield Number(value), insn.offset + yield OperandNumber(i, value), insn.offset + + if insn.mnemonic == "add" and 0 < value < MAX_STRUCTURE_SIZE: + # for pattern like: + # + # add eax, 0x10 + # + # assume 0x10 is also an offset (imagine eax is a pointer). + yield Offset(value), insn.offset + yield OperandOffset(i, value), insn.offset def read_bytes(smda_report, va, num_bytes=None): @@ -198,11 +208,10 @@ def extract_insn_offset_features(f, bb, insn): # mov eax, [esi + 4] # mov eax, [esi + ecx + 16384] operands = [o.strip() for o in insn.operands.split(",")] - for operand in operands: - if "ptr" not in operand: - continue + for i, operand in enumerate(operands): if "esp" in operand or "ebp" in operand or "rbp" in operand: continue + number = 0 number_hex = re.search(PATTERN_HEXNUM, operand) number_int = re.search(PATTERN_SINGLENUM, operand) @@ -212,7 +221,26 @@ def extract_insn_offset_features(f, bb, insn): elif number_int: number = int(number_int.group("num")) number = -1 * number if number_int.group().startswith("-") else number + + if "ptr" not in operand: + if ( + insn.mnemonic == "lea" + and i == 1 + and (operand.count("+") + operand.count("-")) == 1 + and operand.count("*") == 0 + ): + # for pattern like: + # + # lea eax, [ebx + 1] + # + # assume 1 is also a number (imagine ebx is a zero register). 
+ yield Number(number), insn.offset + yield OperandNumber(i, number), insn.offset + + continue + yield Offset(number), insn.offset + yield OperandOffset(i, number), insn.offset def is_security_cookie(f, bb, insn): diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index cf678e2f..9df95d79 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -19,7 +19,7 @@ import envi.archs.amd64.disasm import capa.features.extractors.helpers import capa.features.extractors.viv.helpers -from capa.features.insn import API, Number, Offset, Mnemonic, OperandNumber, OperandOffset +from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle @@ -579,6 +579,15 @@ def extract_op_number_features( yield Number(v), ihandle.address yield OperandNumber(i, v), ihandle.address + if insn.mnem == "add" and 0 < v < MAX_STRUCTURE_SIZE and isinstance(oper, envi.archs.i386.disasm.i386ImmOper): + # for pattern like: + # + # add eax, 0x10 + # + # assume 0x10 is also an offset (imagine eax is a pointer). 
+ yield Offset(v), insn.va + yield OperandOffset(i, v), insn.va + def extract_op_offset_features(f, bb, ihandle: InsnHandle, i, oper: envi.Operand) -> Iterator[Tuple[Feature, Address]]: """parse structure offset features from the given operand.""" @@ -608,6 +617,15 @@ def extract_op_offset_features(f, bb, ihandle: InsnHandle, i, oper: envi.Operand yield Offset(v), ihandle.address yield OperandOffset(i, v), ihandle.address + if insn.mnem == "lea" and i == 1 and not f.vw.probeMemory(v, 1, envi.memory.MM_READ): + # for pattern like: + # + # lea eax, [ebx + 1] + # + # assume 1 is also a number (imagine ebx is a zero register). + yield Number(v), insn.va + yield OperandNumber(i, v), insn.va + # like: [esi + ecx + 16384] # reg ^ ^ # index ^ diff --git a/capa/features/insn.py b/capa/features/insn.py index 28622b05..00a549f1 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -29,6 +29,10 @@ class Number(Feature): return capa.render.utils.hex(self.value) +# max recognized structure size (and therefore, offset size) +MAX_STRUCTURE_SIZE = 0x10000 + + class Offset(Feature): def __init__(self, value: int, description=None): super(Offset, self).__init__(value, description=description) diff --git a/tests/data b/tests/data index 12c64af2..11ae8d0d 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 12c64af268337d6213b603e00cb9df908c779ff9 +Subproject commit 11ae8d0d38b9703b999c988f927198c8fd132ff5 diff --git a/tests/fixtures.py b/tests/fixtures.py index 18400ca7..d7d7f4b8 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -37,7 +37,8 @@ from capa.features.common import ( ) CD = os.path.dirname(__file__) -DNFILE_TESTFILES = "dnfile-testfiles" +DOTNET_DIR = os.path.join(CD, "data", "dotnet") +DNFILE_TESTFILES = os.path.join(DOTNET_DIR, "dnfile-testfiles") @contextlib.contextmanager @@ -181,6 +182,14 @@ def extract_basic_block_features(extractor, f, bb): return features +# f may not be hashable (e.g. 
ida func_t) so cannot @lru_cache this +def extract_instruction_features(extractor, f, bb, insn): + features = collections.defaultdict(set) + for feature, va in extractor.extract_insn_features(f, bb, insn): + features[feature].add(va) + return features + + # note: too reduce the testing time it's recommended to reuse already existing test samples, if possible def get_data_path_by_name(name): if name == "mimikatz": @@ -234,7 +243,7 @@ def get_data_path_by_name(name): elif name.startswith("b9f5b"): return os.path.join(CD, "data", "b9f5bd514485fb06da39beff051b9fdc.exe_") elif name.startswith("mixed-mode-64"): - return os.path.join(CD, "data", DNFILE_TESTFILES, "mixed-mode", "ModuleCode", "bin", "ModuleCode_amd64.exe") + return os.path.join(DNFILE_TESTFILES, "mixed-mode", "ModuleCode", "bin", "ModuleCode_amd64.exe") else: raise ValueError("unexpected sample fixture: %s" % name) @@ -317,6 +326,13 @@ def get_basic_block(extractor, f, va): raise ValueError("basic block not found") +def get_instruction(extractor, f, bb, va): + for insn in extractor.get_instructions(f, bb): + if int(insn) == va: + return insn + raise ValueError("instruction not found") + + def resolve_scope(scope): if scope == "file": @@ -328,8 +344,32 @@ def resolve_scope(scope): inner_file.__name__ = scope return inner_file + elif "insn=" in scope: + # like `function=0x401000,bb=0x40100A,insn=0x40100A` + assert "function=" in scope + assert "bb=" in scope + assert "insn=" in scope + fspec, _, spec = scope.partition(",") + bbspec, _, ispec = spec.partition(",") + fva = int(fspec.partition("=")[2], 0x10) + bbva = int(bbspec.partition("=")[2], 0x10) + iva = int(ispec.partition("=")[2], 0x10) + + def inner_insn(extractor): + f = get_function(extractor, fva) + bb = get_basic_block(extractor, f, bbva) + insn = get_instruction(extractor, f, bb, iva) + features = extract_instruction_features(extractor, f, bb, insn) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return 
features + + inner_insn.__name__ = scope + return inner_insn elif "bb=" in scope: # like `function=0x401000,bb=0x40100A` + assert "function=" in scope + assert "bb=" in scope fspec, _, bbspec = scope.partition(",") fva = int(fspec.partition("=")[2], 0x10) bbva = int(bbspec.partition("=")[2], 0x10) @@ -459,6 +499,30 @@ FEATURE_PRESENCE_TESTS = sorted( # insn/offset: negative ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), + # + # insn/offset from mnemonic: add + # + # should not be considered, too big for an offset: + # .text:00401D85 81 C1 00 00 00 80 add ecx, 80000000h + ("mimikatz", "function=0x401D64,bb=0x401D73,insn=0x401D85", capa.features.insn.Offset(0x80000000), False), + # should not be considered, relative to stack: + # .text:00401CF6 83 C4 10 add esp, 10h + ("mimikatz", "function=0x401CC7,bb=0x401CDE,insn=0x401CF6", capa.features.insn.Offset(0x10), False), + # yes, this is also an offset (imagine eax is a pointer): + # .text:0040223C 83 C0 04 add eax, 4 + ("mimikatz", "function=0x402203,bb=0x402221,insn=0x40223C", capa.features.insn.Offset(0x4), True), + # + # insn/number from mnemonic: lea + # + # should not be considered, lea operand invalid encoding + # .text:00471EE6 8D 1C 81 lea ebx, [ecx+eax*4] + ("mimikatz", "function=0x471EAB,bb=0x471ED8,insn=0x471EE6", capa.features.insn.Number(0x4), False), + # should not be considered, lea operand invalid encoding + # .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h] + ("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False), + # yes, this is also a number (imagine ebx is zero): + # .text:004018C0 8D 4B 02 lea ecx, [ebx+2] + ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), # insn/api ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContextW"), True), ("mimikatz", "function=0x403BAC", 
capa.features.insn.API("advapi32.CryptAcquireContext"), True), diff --git a/tests/test_smda_features.py b/tests/test_smda_features.py index 3873d71b..6614c24d 100644 --- a/tests/test_smda_features.py +++ b/tests/test_smda_features.py @@ -22,14 +22,6 @@ def test_smda_features(sample, scope, feature, expected): if scope.__name__ == "file" and isinstance(feature, capa.features.file.FunctionName) and expected is True: pytest.xfail("SMDA has no function ID") - if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandNumber) and expected is True: - # SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937 - pytest.xfail("SMDA doesn't support operand numbers") - - if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandOffset) and expected is True: - # SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937 - pytest.xfail("SMDA doesn't support operand offsets") - fixtures.do_test_feature_presence(fixtures.get_smda_extractor, sample, scope, feature, expected)