From 43c6eec30bd69a7ebb292a3740e1670c1082363c Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 8 Aug 2020 12:48:56 -0600 Subject: [PATCH 01/44] extractors: begin to implement lancelot backend --- capa/features/extractors/lancelot/__init__.py | 90 ++++ .../extractors/lancelot/basicblock.py | 117 +++++ capa/features/extractors/lancelot/file.py | 82 ++++ capa/features/extractors/lancelot/function.py | 55 +++ capa/features/extractors/lancelot/insn.py | 191 +++++++++ tests/test_lancelot_features.py | 401 ++++++++++++++++++ tests/test_viv_features.py | 2 + 7 files changed, 938 insertions(+) create mode 100644 capa/features/extractors/lancelot/__init__.py create mode 100644 capa/features/extractors/lancelot/basicblock.py create mode 100644 capa/features/extractors/lancelot/file.py create mode 100644 capa/features/extractors/lancelot/function.py create mode 100644 capa/features/extractors/lancelot/insn.py create mode 100644 tests/test_lancelot_features.py diff --git a/capa/features/extractors/lancelot/__init__.py b/capa/features/extractors/lancelot/__init__.py new file mode 100644 index 00000000..8be8c054 --- /dev/null +++ b/capa/features/extractors/lancelot/__init__.py @@ -0,0 +1,90 @@ +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging + +import lancelot + +import capa.features.extractors +import capa.features.extractors.lancelot.file +import capa.features.extractors.lancelot.insn +import capa.features.extractors.lancelot.function +import capa.features.extractors.lancelot.basicblock + +__all__ = ["file", "function", "basicblock", "insn"] +logger = logging.getLogger(__name__) + + +class BB(object): + """extend the lancelot.BasicBlock with an __int__ method to access the address""" + + def __init__(self, ws, bb): + super(BB, self).__init__() + self.ws = ws + self.address = bb.address + self.length = bb.length + self.predecessors = bb.predecessors + self.successors = bb.successors + + def __int__(self): + return self.address + + @property + def instructions(self): + va = self.address + while va <= self.address + self.length: + try: + insn = self.ws.read_insn(va) + except ValueError: + logger.warning("failed to read instruction at 0x%x", va) + return + + yield insn + va += insn.length + + +class LancelotFeatureExtractor(capa.features.extractors.FeatureExtractor): + def __init__(self, buf): + super(LancelotFeatureExtractor, self).__init__() + self.buf = buf + self.ws = lancelot.from_bytes(buf) + + def get_base_address(self): + return self.ws.base_address + + def extract_file_features(self): + for feature, va in capa.features.extractors.lancelot.file.extract_file_features(self.buf): + yield feature, va + + def get_functions(self): + for va in self.ws.get_functions(): + yield va + + def extract_function_features(self, f): + for feature, va in capa.features.extractors.lancelot.function.extract_function_features(self.ws, f): + yield feature, va + + def get_basic_blocks(self, f): + try: + cfg = self.ws.build_cfg(f) + except: + logger.warning("failed to build CFG for 0x%x", f) + return + else: + for bb in cfg.basic_blocks.values(): + yield BB(self.ws, bb) + + def extract_basic_block_features(self, f, bb): + for feature, va in 
capa.features.extractors.lancelot.basicblock.extract_basic_block_features(self.ws, bb): + yield feature, va + + def get_instructions(self, f, bb): + return bb.instructions + + def extract_insn_features(self, f, bb, insn): + for feature, va in capa.features.extractors.lancelot.insn.extract_insn_features(self.ws, insn): + yield feature, va diff --git a/capa/features/extractors/lancelot/basicblock.py b/capa/features/extractors/lancelot/basicblock.py new file mode 100644 index 00000000..e08e67fc --- /dev/null +++ b/capa/features/extractors/lancelot/basicblock.py @@ -0,0 +1,117 @@ +import string +import struct +import logging + +from lancelot import ( + FLOW_VA, + OPERAND_SIZE, + OPERAND_TYPE, + MEMORY_OPERAND_BASE, + OPERAND_TYPE_MEMORY, + OPERAND_TYPE_IMMEDIATE, + IMMEDIATE_OPERAND_VALUE, +) + +from capa.features import Characteristic +from capa.features.basicblock import BasicBlock +from capa.features.extractors.helpers import MIN_STACKSTRING_LEN + +logger = logging.getLogger(__name__) + + +def extract_bb_tight_loop(ws, bb): + """ check basic block for tight loop indicators """ + if bb.address in map(lambda flow: flow[FLOW_VA], bb.successors): + yield Characteristic("tight loop"), bb.address + + +def is_mov_imm_to_stack(insn): + if not insn.mnemonic.startswith("mov"): + return False + + try: + dst, src = insn.operands + except ValueError: + # not two operands + return False + + if src[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: + return False + + if not dst[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: + return False + + if dst[MEMORY_OPERAND_BASE] not in ("ebp", "rbp", "esp", "rsp"): + return False + + return True + + +def is_printable_ascii(chars): + return all(c < 127 and chr(c) in string.printable for c in chars) + + +def is_printable_utf16le(chars): + if all(c == b"\x00" for c in chars[1::2]): + return is_printable_ascii(chars[::2]) + + +def get_printable_len(operand): + """ + Return string length if all operand bytes are ascii or utf16-le printable + """ + operand_size = 
operand[OPERAND_SIZE] + if operand_size == 8: + chars = struct.pack(" MIN_STACKSTRING_LEN: + return True + + return False + + +def extract_stackstring(ws, bb): + """ check basic block for stackstring indicators """ + if _bb_has_stackstring(ws, bb): + yield Characteristic("stack string"), bb.va + + +def extract_basic_block_features(ws, bb): + yield BasicBlock(), bb.address + for bb_handler in BASIC_BLOCK_HANDLERS: + for feature, va in bb_handler(ws, bb): + yield feature, va + + +BASIC_BLOCK_HANDLERS = ( + extract_bb_tight_loop, + extract_stackstring, +) diff --git a/capa/features/extractors/lancelot/file.py b/capa/features/extractors/lancelot/file.py new file mode 100644 index 00000000..8196e886 --- /dev/null +++ b/capa/features/extractors/lancelot/file.py @@ -0,0 +1,82 @@ +import pefile + +import capa.features.extractors.strings +from capa.features import String, Characteristic +from capa.features.file import Export, Import, Section + + +def extract_file_embedded_pe(buf, pe): + buf = buf[2:] + + total_offset = 2 + while True: + try: + offset = buf.index(b"MZ") + except ValueError: + return + else: + rest = buf[offset:] + total_offset += offset + + try: + _ = pefile.PE(data=rest) + except: + pass + else: + yield Characteristic("embedded pe"), total_offset + + buf = rest[2:] + total_offset += 2 + + +def extract_file_export_names(buf, pe): + if not hasattr(pe, "DIRECTORY_ENTRY_EXPORT"): + return + + base_address = pe.OPTIONAL_HEADER.ImageBase + for exp in pe.DIRECTORY_ENTRY_EXPORT.symbols: + yield Export(exp.name.decode("ascii")), base_address + exp.address + + +def extract_file_import_names(buf, pe): + base_address = pe.OPTIONAL_HEADER.ImageBase + for entry in pe.DIRECTORY_ENTRY_IMPORT: + libname = entry.dll.decode("ascii").lower().partition(".")[0] + for imp in entry.imports: + impaddr = base_address + imp.address + if imp.ordinal: + yield Import("%s.#%s" % (libname, imp.ordinal)), impaddr + else: + impname = imp.name.decode("ascii") + yield Import("%s.%s" % 
(libname, impname)), impaddr + yield Import("%s" % (impname)), impaddr + + +def extract_file_section_names(buf, pe): + base_address = pe.OPTIONAL_HEADER.ImageBase + for section in pe.sections: + yield Section(section.Name.partition(b"\x00")[0].decode("ascii")), base_address + section.VirtualAddress + + +def extract_file_strings(buf, pe): + for s in capa.features.extractors.strings.extract_ascii_strings(buf): + yield String(s.s), s.offset + + for s in capa.features.extractors.strings.extract_unicode_strings(buf): + yield String(s.s), s.offset + + +def extract_file_features(buf): + pe = pefile.PE(data=buf) + for file_handler in FILE_HANDLERS: + for feature, va in file_handler(buf, pe): + yield feature, va + + +FILE_HANDLERS = ( + extract_file_embedded_pe, + extract_file_export_names, + extract_file_import_names, + extract_file_section_names, + extract_file_strings, +) diff --git a/capa/features/extractors/lancelot/function.py b/capa/features/extractors/lancelot/function.py new file mode 100644 index 00000000..04bb699a --- /dev/null +++ b/capa/features/extractors/lancelot/function.py @@ -0,0 +1,55 @@ +import logging + +from lancelot import ( + FLOW_VA, + FLOW_TYPE, + FLOW_TYPE_CONDITIONAL_JUMP, + FLOW_TYPE_CONDITIONAL_MOVE, + FLOW_TYPE_UNCONDITIONAL_JUMP, +) + +from capa.features import Characteristic +from capa.features.extractors import loops + +logger = logging.getLogger(__name__) + + +def extract_function_switch(ws, f): + return [] + + +def extract_function_calls_to(ws, f): + return [] + + +def extract_function_loop(ws, f): + edges = [] + for bb in ws.build_cfg(f).basic_blocks.values(): + for flow in bb.successors: + if flow[FLOW_TYPE] in ( + FLOW_TYPE_UNCONDITIONAL_JUMP, + FLOW_TYPE_CONDITIONAL_JUMP, + FLOW_TYPE_CONDITIONAL_MOVE, + ): + edges.append((bb.address, flow[FLOW_VA])) + continue + + if edges and loops.has_loop(edges): + yield Characteristic("loop"), f + + +FUNCTION_HANDLERS = (extract_function_switch, extract_function_calls_to, extract_function_loop) + 
+ +_not_implemented = set([]) + + +def extract_function_features(ws, f): + for func_handler in FUNCTION_HANDLERS: + try: + for feature, va in func_handler(ws, f): + yield feature, va + except NotImplementedError: + if func_handler.__name__ not in _not_implemented: + logger.warning("not implemented: %s", func_handler.__name__) + _not_implemented.add(func_handler.__name__) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py new file mode 100644 index 00000000..6028d453 --- /dev/null +++ b/capa/features/extractors/lancelot/insn.py @@ -0,0 +1,191 @@ +import logging + +from lancelot import ( + OPERAND_TYPE, + PERMISSION_READ, + OPERAND_TYPE_REGISTER, + OPERAND_TYPE_IMMEDIATE, + IMMEDIATE_OPERAND_VALUE, + REGISTER_OPERAND_REGISTER, +) + +from capa.features import ARCH_X32, ARCH_X64 +from capa.features.insn import Number + +logger = logging.getLogger(__name__) + + +# security cookie checks may perform non-zeroing XORs, these are expected within a certain +# byte range within the first and returning basic blocks, this helps to reduce FP features +SECURITY_COOKIE_BYTES_DELTA = 0x40 + + +def get_arch(ws): + if ws.arch == "x32": + return ARCH_X32 + elif ws.arch == "x64": + return ARCH_X64 + else: + raise ValueError("unexpected architecture") + + +def get_imports(ws): + """caching accessor""" + raise NotImplementedError() + + +def extract_insn_api_features(ws, insn): + """parse API features from the given instruction.""" + raise NotImplementedError() + + +def extract_insn_number_features(ws, insn): + """parse number features from the given instruction.""" + operands = insn.operands + + for oper in operands: + if oper[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: + continue + + v = oper[IMMEDIATE_OPERAND_VALUE] + + if ws.probe(v) & PERMISSION_READ: + # v is a valid address + # therefore, assume its not also a constant. 
+ continue + + if ( + insn.mnemonic == "add" + and operands[0][OPERAND_TYPE] == OPERAND_TYPE_REGISTER + and operands[0][REGISTER_OPERAND_REGISTER] == "esp" + ): + # skip things like: + # + # .text:00401140 call sub_407E2B + # .text:00401145 add esp, 0Ch + return + + yield Number(v), insn.address + yield Number(v, arch=get_arch(ws)), insn.address + + +def derefs(ws, p): + """ + recursively follow the given pointer, yielding the valid memory addresses along the way. + useful when you may have a pointer to string, or pointer to pointer to string, etc. + this is a "do what i mean" type of helper function. + """ + raise NotImplementedError() + + +def read_bytes(ws, va): + """ + read up to MAX_BYTES_FEATURE_SIZE from the given address. + """ + raise NotImplementedError() + + +def extract_insn_bytes_features(ws, insn): + """ + parse byte sequence features from the given instruction. + """ + raise NotImplementedError() + + +def read_string(ws, va): + raise NotImplementedError() + + +def extract_insn_string_features(ws, insn): + """parse string features from the given instruction.""" + raise NotImplementedError() + + +def extract_insn_offset_features(ws, insn): + """parse structure offset features from the given instruction.""" + raise NotImplementedError() + + +def is_security_cookie(ws, insn): + """ + check if an instruction is related to security cookie checks + """ + raise NotImplementedError() + + +def extract_insn_nzxor_characteristic_features(ws, insn): + """ + parse non-zeroing XOR instruction from the given instruction. + ignore expected non-zeroing XORs, e.g. security cookies. + """ + raise NotImplementedError() + + +def extract_insn_mnemonic_features(ws, insn): + """parse mnemonic features from the given instruction.""" + raise NotImplementedError() + + +def extract_insn_peb_access_characteristic_features(ws, insn): + """ + parse peb access from the given function. 
fs:[0x30] on x86, gs:[0x60] on x64 + """ + raise NotImplementedError() + + +def extract_insn_segment_access_features(ws, insn): + """ parse the instruction for access to fs or gs """ + raise NotImplementedError() + + +def extract_insn_cross_section_cflow(ws, insn): + """ + inspect the instruction for a CALL or JMP that crosses section boundaries. + """ + raise NotImplementedError() + + +# this is a feature that's most relevant at the function scope, +# however, its most efficient to extract at the instruction scope. +def extract_function_calls_from(ws, insn): + raise NotImplementedError() + + +# this is a feature that's most relevant at the function or basic block scope, +# however, its most efficient to extract at the instruction scope. +def extract_function_indirect_call_characteristic_features(ws, insn): + """ + extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4]) + does not include calls like => call ds:dword_ABD4974 + """ + raise NotImplementedError() + + +_not_implemented = set([]) + + +def extract_insn_features(ws, insn): + for insn_handler in INSTRUCTION_HANDLERS: + try: + for feature, va in insn_handler(ws, insn): + yield feature, va + except NotImplementedError: + if insn_handler.__name__ not in _not_implemented: + logger.warning("not implemented: %s", insn_handler.__name__) + _not_implemented.add(insn_handler.__name__) + + +INSTRUCTION_HANDLERS = ( + extract_insn_api_features, + extract_insn_number_features, + extract_insn_string_features, + extract_insn_bytes_features, + extract_insn_offset_features, + extract_insn_nzxor_characteristic_features, + extract_insn_mnemonic_features, + extract_insn_peb_access_characteristic_features, + extract_insn_cross_section_cflow, + extract_insn_segment_access_features, + extract_function_calls_from, + extract_function_indirect_call_characteristic_features, +) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py new file mode 100644 index 00000000..f4fd69ab 
--- /dev/null +++ b/tests/test_lancelot_features.py @@ -0,0 +1,401 @@ +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import os.path +import collections + +try: + from functools import lru_cache +except ImportError: + # pip install backports.functools-lru-cache + from backports.functools_lru_cache import lru_cache + +import pytest + +import capa.features +import capa.features.file +import capa.features.insn +import capa.features.basicblock +import capa.features.extractors.lancelot.file +import capa.features.extractors.lancelot.insn +import capa.features.extractors.lancelot.function +import capa.features.extractors.lancelot.basicblock +from capa.features import ARCH_X32, ARCH_X64 + +CD = os.path.dirname(__file__) + + +@pytest.fixture +def mimikatz(): + return os.path.join(CD, "data", "mimikatz.exe_") + + +@pytest.fixture +def kernel32(): + return os.path.join(CD, "data", "kernel32.dll_") + + +@lru_cache +def extract_file_features(extractor): + features = set([]) + for feature, va in extractor.extract_file_features(): + features.add(feature) + return features + + +@lru_cache +def extract_function_features(f): + features = collections.defaultdict(set) + for bb in f.basic_blocks: + for insn in bb.instructions: + for feature, va in capa.features.extractors.lancelot.insn.extract_features(f, bb, insn): + features[feature].add(va) + for feature, va in capa.features.extractors.lancelot.basicblock.extract_features(f, bb): + features[feature].add(va) + 
for feature, va in capa.features.extractors.lancelot.function.extract_features(f): + features[feature].add(va) + return features + + +@lru_cache +def extract_basic_block_features(f, bb): + features = set({}) + for insn in bb.instructions: + for feature, _ in capa.features.extractors.lancelot.insn.extract_features(f, bb, insn): + features.add(feature) + for feature, _ in capa.features.extractors.lancelot.basicblock.extract_features(f, bb): + features.add(feature) + return features + + +@lru_cache +def get_lancelot_extractor(path): + with open(path, "rb") as f: + buf = f.read() + + return capa.features.extractors.lancelot.LancelotFeatureExtractor(buf) + + +@pytest.fixture +def sample(request): + if request.param == "mimikatz": + return os.path.join(CD, "data", "mimikatz.exe_") + elif request.param == "kernel32": + return os.path.join(CD, "data", "kernel32.dll_") + else: + raise ValueError("unexpected sample fixture") + + +@pytest.fixture +def scope(request): + if request.param == "file": + return extract_file_features + else: + raise ValueError("unexpected scope fixture") + + +@pytest.mark.parametrize( + "sample,scope,feature,expected", + [ + # sections + ("mimikatz", "file", capa.features.file.Section(".rsrc"), True), + ("mimikatz", "file", capa.features.file.Section(".text"), True), + ("mimikatz", "file", capa.features.file.Section(".nope"), False), + # exports + ("kernel32", "file", capa.features.file.Export("BaseThreadInitThunk"), True), + ("kernel32", "file", capa.features.file.Export("lstrlenW"), True), + ("kernel32", "file", capa.features.file.Export("nope"), False), + # imports + ("mimikatz", "file", capa.features.file.Import("advapi32.CryptSetHashParam"), True), + ("mimikatz", "file", capa.features.file.Import("CryptSetHashParam"), True), + ("mimikatz", "file", capa.features.file.Import("kernel32.IsWow64Process"), True), + ("mimikatz", "file", capa.features.file.Import("msvcrt.exit"), True), + ("mimikatz", "file", capa.features.file.Import("cabinet.#11"), 
True), + ("mimikatz", "file", capa.features.file.Import("#11"), False), + ("mimikatz", "file", capa.features.file.Import("#nope"), False), + ("mimikatz", "file", capa.features.file.Import("nope"), False), + ], + indirect=["sample", "scope"], +) +def test_file_section_features(sample, scope, feature, expected): + extractor = get_lancelot_extractor(sample) + features = scope(extractor) + assert (feature in features) == expected + + +""" +def test_api_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x403BAC)) + assert capa.features.insn.API("advapi32.CryptAcquireContextW") in features + assert capa.features.insn.API("advapi32.CryptAcquireContext") in features + assert capa.features.insn.API("advapi32.CryptGenKey") in features + assert capa.features.insn.API("advapi32.CryptImportKey") in features + assert capa.features.insn.API("advapi32.CryptDestroyKey") in features + assert capa.features.insn.API("CryptAcquireContextW") in features + assert capa.features.insn.API("CryptAcquireContext") in features + assert capa.features.insn.API("CryptGenKey") in features + assert capa.features.insn.API("CryptImportKey") in features + assert capa.features.insn.API("CryptDestroyKey") in features + + +def test_api_features_64_bit(sample_a198216798ca38f280dc413f8c57f2c2): + features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x4011B0)) + assert capa.features.insn.API("kernel32.GetStringTypeA") in features + assert capa.features.insn.API("kernel32.GetStringTypeW") not in features + assert capa.features.insn.API("kernel32.GetStringType") in features + assert capa.features.insn.API("GetStringTypeA") in features + assert capa.features.insn.API("GetStringType") in features + # call via thunk in IDA Pro + features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x401CB0)) + assert capa.features.insn.API("msvcrt.vfprintf") in features + assert 
capa.features.insn.API("vfprintf") in features + + +def test_string_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) + assert capa.features.String("SCardControl") in features + assert capa.features.String("SCardTransmit") in features + assert capa.features.String("ACR > ") in features + # other strings not in this function + assert capa.features.String("bcrypt.dll") not in features + + +def test_string_pointer_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x44EDEF)) + assert capa.features.String("INPUTEVENT") in features + + +def test_byte_features(sample_9324d1a8ae37a36ae560c37448c9705a): + features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) + wanted = capa.features.Bytes(b"\xED\x24\x9E\xF4\x52\xA9\x07\x47\x55\x8E\xE1\xAB\x30\x8E\x23\x61") + # use `==` rather than `is` because the result is not `True` but a truthy value. + assert wanted.evaluate(features) == True + + +def test_byte_features64(sample_lab21_01): + features = extract_function_features(lancelot_utils.Function(sample_lab21_01.ws, 0x1400010C0)) + wanted = capa.features.Bytes(b"\x32\xA2\xDF\x2D\x99\x2B\x00\x00") + # use `==` rather than `is` because the result is not `True` but a truthy value. 
+ assert wanted.evaluate(features) == True + + +def test_bytes_pointer_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x44EDEF)) + assert capa.features.Bytes("INPUTEVENT".encode("utf-16le")).evaluate(features) == True + + +def test_number_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) + assert capa.features.insn.Number(0xFF) in features + assert capa.features.insn.Number(0x3136B0) in features + # the following are stack adjustments + assert capa.features.insn.Number(0xC) not in features + assert capa.features.insn.Number(0x10) not in features + + +def test_number_arch_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) + assert capa.features.insn.Number(0xFF) in features + assert capa.features.insn.Number(0xFF, arch=ARCH_X32) in features + assert capa.features.insn.Number(0xFF, arch=ARCH_X64) not in features + + +def test_offset_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) + assert capa.features.insn.Offset(0x0) in features + assert capa.features.insn.Offset(0x4) in features + assert capa.features.insn.Offset(0xC) in features + # the following are stack references + assert capa.features.insn.Offset(0x8) not in features + assert capa.features.insn.Offset(0x10) not in features + + # this function has the following negative offsets + # movzx ecx, byte ptr [eax-1] + # movzx eax, byte ptr [eax-2] + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x4011FB)) + assert capa.features.insn.Offset(-0x1) in features + assert capa.features.insn.Offset(-0x2) in features + + +def test_offset_arch_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) + assert capa.features.insn.Offset(0x0) in features + assert capa.features.insn.Offset(0x0, arch=ARCH_X32) in features + assert 
capa.features.insn.Offset(0x0, arch=ARCH_X64) not in features + + +def test_nzxor_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x410DFC)) + assert capa.features.Characteristic("nzxor") in features # 0x0410F0B + + +def get_bb_insn(f, va): + # fetch the BasicBlock and Instruction instances for the given VA in the given function. + for bb in f.basic_blocks: + for insn in bb.instructions: + if insn.va == va: + return (bb, insn) + raise KeyError(va) + + +def test_is_security_cookie(mimikatz): + # not a security cookie check + f = lancelot_utils.Function(mimikatz.ws, 0x410DFC) + for va in [0x0410F0B]: + bb, insn = get_bb_insn(f, va) + assert capa.features.extractors.lancelot.insn.is_security_cookie(f, bb, insn) == False + + # security cookie initial set and final check + f = lancelot_utils.Function(mimikatz.ws, 0x46C54A) + for va in [0x46C557, 0x46C63A]: + bb, insn = get_bb_insn(f, va) + assert capa.features.extractors.lancelot.insn.is_security_cookie(f, bb, insn) == True + + +def test_mnemonic_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) + assert capa.features.insn.Mnemonic("push") in features + assert capa.features.insn.Mnemonic("movzx") in features + assert capa.features.insn.Mnemonic("xor") in features + + assert capa.features.insn.Mnemonic("in") not in features + assert capa.features.insn.Mnemonic("out") not in features + + +def test_peb_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): + features = extract_function_features(lancelot_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.ws, 0xABA6FEC)) + assert capa.features.Characteristic("peb access") in features + + +def test_tight_loop_features(mimikatz): + f = lancelot_utils.Function(mimikatz.ws, 0x402EC4) + for bb in f.basic_blocks: + if bb.va != 0x402F8E: + continue + features = extract_basic_block_features(f, bb) + assert capa.features.Characteristic("tight loop") in features + assert 
capa.features.basicblock.BasicBlock() in features + + +def test_tight_loop_bb_features(mimikatz): + f = lancelot_utils.Function(mimikatz.ws, 0x402EC4) + for bb in f.basic_blocks: + if bb.va != 0x402F8E: + continue + features = extract_basic_block_features(f, bb) + assert capa.features.Characteristic("tight loop") in features + assert capa.features.basicblock.BasicBlock() in features + + +def test_cross_section_flow_features(sample_a198216798ca38f280dc413f8c57f2c2): + features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x4014D0)) + assert capa.features.Characteristic("cross section flow") in features + + # this function has calls to some imports, + # which should not trigger cross-section flow characteristic + features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x401563)) + assert capa.features.Characteristic("cross section flow") not in features + + +def test_segment_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): + features = extract_function_features(lancelot_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.ws, 0xABA6FEC)) + assert capa.features.Characteristic("fs access") in features + + +def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a): + features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x407970)) + assert capa.features.insn.API("kernel32.CreateToolhelp32Snapshot") in features + assert capa.features.insn.API("CreateToolhelp32Snapshot") in features + + +def test_file_embedded_pe(pma_lab_12_04): + features = extract_file_features(pma_lab_12_04.ws, pma_lab_12_04.path) + assert capa.features.Characteristic("embedded pe") in features + + +def test_stackstring_features(mimikatz): + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x4556E5)) + assert capa.features.Characteristic("stack string") in features + + +def test_switch_features(mimikatz): + 
features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x409411)) + assert capa.features.Characteristic("switch") in features + + features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x409393)) + assert capa.features.Characteristic("switch") not in features + + +def test_recursive_call_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): + features = extract_function_features( + lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10003100) + ) + assert capa.features.Characteristic("recursive call") in features + + features = extract_function_features( + lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10007B00) + ) + assert capa.features.Characteristic("recursive call") not in features + + +def test_loop_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): + features = extract_function_features( + lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10003D30) + ) + assert capa.features.Characteristic("loop") in features + + features = extract_function_features( + lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10007250) + ) + assert capa.features.Characteristic("loop") not in features + + +def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5): + features = extract_file_features( + sample_bfb9b5391a13d0afd787e87ab90f14f5.ws, sample_bfb9b5391a13d0afd787e87ab90f14f5.path, + ) + assert capa.features.String("WarStop") in features # ASCII, offset 0x40EC + assert capa.features.String("cimage/png") in features # UTF-16 LE, offset 0x350E + + +def test_function_calls_to(sample_9324d1a8ae37a36ae560c37448c9705a): + features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) + assert 
capa.features.Characteristic("calls to") in features + assert len(features[capa.features.Characteristic("calls to")]) == 1 + + +def test_function_calls_to64(sample_lab21_01): + features = extract_function_features(lancelot_utils.Function(sample_lab21_01.ws, 0x1400052D0)) # memcpy + assert capa.features.Characteristic("calls to") in features + assert len(features[capa.features.Characteristic("calls to")]) == 8 + + +def test_function_calls_from(sample_9324d1a8ae37a36ae560c37448c9705a): + features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) + assert capa.features.Characteristic("calls from") in features + assert len(features[capa.features.Characteristic("calls from")]) == 23 + + +def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a): + features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) + assert len(features[capa.features.basicblock.BasicBlock()]) == 26 + + +def test_indirect_call_features(sample_a933a1a402775cfa94b6bee0963f4b46): + features = extract_function_features(lancelot_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.ws, 0xABA68A0)) + assert capa.features.Characteristic("indirect call") in features + assert len(features[capa.features.Characteristic("indirect call")]) == 3 + + +def test_indirect_calls_resolved(sample_c91887d861d9bd4a5872249b641bc9f9): + features = extract_function_features(lancelot_utils.Function(sample_c91887d861d9bd4a5872249b641bc9f9.ws, 0x401A77)) + assert capa.features.insn.API("kernel32.CreatePipe") in features + assert capa.features.insn.API("kernel32.SetHandleInformation") in features + assert capa.features.insn.API("kernel32.CloseHandle") in features + assert capa.features.insn.API("kernel32.WriteFile") in features +""" diff --git a/tests/test_viv_features.py b/tests/test_viv_features.py index ce480a7e..5b8b0662 100644 --- a/tests/test_viv_features.py +++ b/tests/test_viv_features.py @@ -6,6 
+6,8 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import collections + import viv_utils from fixtures import * From 9bde11fa6fea1c431ccd091e27faddff78793083 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 8 Aug 2020 13:51:34 -0600 Subject: [PATCH 02/44] extractor: lancelot: fix stackstring --- capa/features/extractors/lancelot/basicblock.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/lancelot/basicblock.py b/capa/features/extractors/lancelot/basicblock.py index e08e67fc..b60ea86f 100644 --- a/capa/features/extractors/lancelot/basicblock.py +++ b/capa/features/extractors/lancelot/basicblock.py @@ -38,7 +38,10 @@ def is_mov_imm_to_stack(insn): if src[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: return False - if not dst[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: + if src[IMMEDIATE_OPERAND_VALUE] < 0: + return False + + if dst[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: return False if dst[MEMORY_OPERAND_BASE] not in ("ebp", "rbp", "esp", "rsp"): @@ -89,6 +92,7 @@ def _bb_has_stackstring(ws, bb): for insn in bb.instructions: if is_mov_imm_to_stack(insn): # add number of operand bytes + print(hex(insn.address)) src = insn.operands[1] count += get_printable_len(src) @@ -101,7 +105,7 @@ def _bb_has_stackstring(ws, bb): def extract_stackstring(ws, bb): """ check basic block for stackstring indicators """ if _bb_has_stackstring(ws, bb): - yield Characteristic("stack string"), bb.va + yield Characteristic("stack string"), bb.address def extract_basic_block_features(ws, bb): From bf4a8dcd3e379bededcdafbb2390c465eafb8b38 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 8 Aug 2020 13:51:50 -0600 Subject: [PATCH 03/44] setup: add dep on backports.lru --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py 
b/setup.py index d4a0ae60..412d26b2 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ else: requirements.append("vivisect @ https://github.com/williballenthin/vivisect/tarball/v0.0.20200804#egg=vivisect") requirements.append("viv-utils") requirements.append("networkx==2.2") # v2.2 is last version supported by Python 2.7 + requirements.append("backports.functools-lru-cache") # this sets __version__ # via: http://stackoverflow.com/a/7071358/87207 @@ -52,7 +53,7 @@ setuptools.setup( "pycodestyle", "black ; python_version>'3.0'", "isort", - ] + ], }, zip_safe=False, keywords="capa", From 393b332f9c3232cb30300dc86b5673de431b079a Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 8 Aug 2020 13:52:01 -0600 Subject: [PATCH 04/44] feature: insn: better render negative offset --- capa/features/insn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/capa/features/insn.py b/capa/features/insn.py index bd8b4c94..fc8d01f6 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -24,7 +24,10 @@ class Number(Feature): super(Number, self).__init__(value, arch=arch, description=description) def get_value_str(self): - return "0x%X" % self.value + if self.value < 0: + return "-0x%X" % (-self.value) + else: + return "0x%X" % self.value class Offset(Feature): From 02a8ad1ea42f5c589e7f162a1dbc4cf708f7fb9a Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 8 Aug 2020 13:52:22 -0600 Subject: [PATCH 05/44] tests: add more lancelot feature tests --- capa/features/extractors/lancelot/function.py | 4 +- capa/features/extractors/viv/__init__.py | 4 - tests/test_lancelot_features.py | 119 +++++++++++++----- 3 files changed, 89 insertions(+), 38 deletions(-) diff --git a/capa/features/extractors/lancelot/function.py b/capa/features/extractors/lancelot/function.py index 04bb699a..5ed6a900 100644 --- a/capa/features/extractors/lancelot/function.py +++ b/capa/features/extractors/lancelot/function.py @@ -15,11 +15,11 @@ logger = 
logging.getLogger(__name__) def extract_function_switch(ws, f): - return [] + raise NotImplementedError() def extract_function_calls_to(ws, f): - return [] + raise NotImplementedError() def extract_function_loop(ws, f): diff --git a/capa/features/extractors/viv/__init__.py b/capa/features/extractors/viv/__init__.py index c6511215..76e2a2f2 100644 --- a/capa/features/extractors/viv/__init__.py +++ b/capa/features/extractors/viv/__init__.py @@ -8,11 +8,7 @@ import types -import file -import insn -import function import viv_utils -import basicblock import capa.features.extractors import capa.features.extractors.viv.file diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index f4fd69ab..deca025a 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -30,16 +30,6 @@ from capa.features import ARCH_X32, ARCH_X64 CD = os.path.dirname(__file__) -@pytest.fixture -def mimikatz(): - return os.path.join(CD, "data", "mimikatz.exe_") - - -@pytest.fixture -def kernel32(): - return os.path.join(CD, "data", "kernel32.dll_") - - @lru_cache def extract_file_features(extractor): features = set([]) @@ -49,26 +39,26 @@ def extract_file_features(extractor): @lru_cache -def extract_function_features(f): +def extract_function_features(extractor, f): features = collections.defaultdict(set) - for bb in f.basic_blocks: - for insn in bb.instructions: - for feature, va in capa.features.extractors.lancelot.insn.extract_features(f, bb, insn): + for bb in extractor.get_basic_blocks(f): + for insn in extractor.get_instructions(f, bb): + for feature, va in extractor.extract_insn_features(f, bb, insn): features[feature].add(va) - for feature, va in capa.features.extractors.lancelot.basicblock.extract_features(f, bb): + for feature, va in extractor.extract_basic_block_features(f, bb): features[feature].add(va) - for feature, va in capa.features.extractors.lancelot.function.extract_features(f): + for feature, va in 
extractor.extract_function_features(f): features[feature].add(va) return features @lru_cache -def extract_basic_block_features(f, bb): +def extract_basic_block_features(extractor, f, bb): features = set({}) - for insn in bb.instructions: - for feature, _ in capa.features.extractors.lancelot.insn.extract_features(f, bb, insn): + for insn in extractor.get_instructions(f, bb): + for feature, _ in extractor.extract_insn_features(f, bb, insn): features.add(feature) - for feature, _ in capa.features.extractors.lancelot.basicblock.extract_features(f, bb): + for feature, _ in extractor.extract_basic_block_features(f, bb): features.add(feature) return features @@ -87,14 +77,51 @@ def sample(request): return os.path.join(CD, "data", "mimikatz.exe_") elif request.param == "kernel32": return os.path.join(CD, "data", "kernel32.dll_") + elif request.param == "pma12-04": + return os.path.join(CD, "data", "Practical Malware Analysis Lab 12-04.exe_") else: raise ValueError("unexpected sample fixture") +def get_function(extractor, fva): + for f in extractor.get_functions(): + if f.__int__() == fva: + return f + raise ValueError("function not found") + + +def get_basic_block(extractor, f, va): + for bb in extractor.get_basic_blocks(f): + if bb.__int__() == va: + return bb + raise ValueError("basic block not found") + + @pytest.fixture def scope(request): if request.param == "file": return extract_file_features + elif "bb=" in request.param: + # like `function=0x401000,bb=0x40100A` + fspec, _, bbspec = request.param.partition(",") + fva = int(fspec.partition("=")[2], 0x10) + bbva = int(bbspec.partition("=")[2], 0x10) + + def inner(extractor): + f = get_function(extractor, fva) + bb = get_basic_block(extractor, f, bbva) + return extract_basic_block_features(extractor, f, bb) + + return inner + elif request.param.startswith("function"): + # like `function=0x401000` + va = int(request.param.partition("=")[2], 0x10) + + def inner(extractor): + f = get_function(extractor, va) + return 
extract_function_features(extractor, f) + + return inner else: raise ValueError("unexpected scope fixture") @@ -102,15 +129,22 @@ def scope(request): @pytest.mark.parametrize( "sample,scope,feature,expected", [ - # sections + # file/characteristic("embedded pe") + ("pma12-04", "file", capa.features.Characteristic("embedded pe"), True), + # file/string + ("mimikatz", "file", capa.features.String("SCardControl"), True), + ("mimikatz", "file", capa.features.String("SCardTransmit"), True), + ("mimikatz", "file", capa.features.String("ACR > "), True), + ("mimikatz", "file", capa.features.String("nope"), False), + # file/sections ("mimikatz", "file", capa.features.file.Section(".rsrc"), True), ("mimikatz", "file", capa.features.file.Section(".text"), True), ("mimikatz", "file", capa.features.file.Section(".nope"), False), - # exports + # file/exports ("kernel32", "file", capa.features.file.Export("BaseThreadInitThunk"), True), ("kernel32", "file", capa.features.file.Export("lstrlenW"), True), ("kernel32", "file", capa.features.file.Export("nope"), False), - # imports + # file/imports ("mimikatz", "file", capa.features.file.Import("advapi32.CryptSetHashParam"), True), ("mimikatz", "file", capa.features.file.Import("CryptSetHashParam"), True), ("mimikatz", "file", capa.features.file.Import("kernel32.IsWow64Process"), True), @@ -119,10 +153,39 @@ def scope(request): ("mimikatz", "file", capa.features.file.Import("#11"), False), ("mimikatz", "file", capa.features.file.Import("#nope"), False), ("mimikatz", "file", capa.features.file.Import("nope"), False), + # function/characteristic(loop) + ("mimikatz", "function=0x401517", capa.features.Characteristic("loop"), True), + ("mimikatz", "function=0x401000", capa.features.Characteristic("loop"), False), + # function/characteristic(switch) + pytest.param( + "mimikatz", + "function=0x409411", + capa.features.Characteristic("switch"), + True, + marks=pytest.mark.xfail(reason="characteristic(switch) not implemented yet"), + ), + 
("mimikatz", "function=0x401000", capa.features.Characteristic("switch"), False), + # function/characteristic(calls to) + pytest.param( + "mimikatz", + "function=0x401000", + capa.features.Characteristic("calls to"), + True, + marks=pytest.mark.xfail(reason="characteristic(calls to) not implemented yet"), + ), + # function/characteristic(tight loop) + ("mimikatz", "function=0x402EC4", capa.features.Characteristic("tight loop"), True), + ("mimikatz", "function=0x401000", capa.features.Characteristic("tight loop"), False), + # function/characteristic(stack string) + ("mimikatz", "function=0x4556E5", capa.features.Characteristic("stack string"), True), + ("mimikatz", "function=0x401000", capa.features.Characteristic("stack string"), False), + # bb/characteristic(tight loop) + ("mimikatz", "function=0x402EC4,bb=0x402F8E", capa.features.Characteristic("tight loop"), True), + ("mimikatz", "function=0x401000,bb=0x401000", capa.features.Characteristic("tight loop"), False), ], indirect=["sample", "scope"], ) -def test_file_section_features(sample, scope, feature, expected): +def test_lancelot_features(sample, scope, feature, expected): extractor = get_lancelot_extractor(sample) features = scope(extractor) assert (feature in features) == expected @@ -313,14 +376,6 @@ def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a): assert capa.features.insn.API("CreateToolhelp32Snapshot") in features -def test_file_embedded_pe(pma_lab_12_04): - features = extract_file_features(pma_lab_12_04.ws, pma_lab_12_04.path) - assert capa.features.Characteristic("embedded pe") in features - - -def test_stackstring_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x4556E5)) - assert capa.features.Characteristic("stack string") in features def test_switch_features(mimikatz): From 4e6b475ff6d89c015be08c56b618fda216efc7fd Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 8 Aug 2020 13:55:52 -0600 Subject: [PATCH 06/44] tests: 
lancelot: add number tests --- tests/test_lancelot_features.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index deca025a..f066dcd0 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -173,15 +173,25 @@ def scope(request): True, marks=pytest.mark.xfail(reason="characteristic(calls to) not implemented yet"), ), - # function/characteristic(tight loop) + # bb/characteristic(tight loop) ("mimikatz", "function=0x402EC4", capa.features.Characteristic("tight loop"), True), ("mimikatz", "function=0x401000", capa.features.Characteristic("tight loop"), False), - # function/characteristic(stack string) + # bb/characteristic(stack string) ("mimikatz", "function=0x4556E5", capa.features.Characteristic("stack string"), True), ("mimikatz", "function=0x401000", capa.features.Characteristic("stack string"), False), # bb/characteristic(tight loop) ("mimikatz", "function=0x402EC4,bb=0x402F8E", capa.features.Characteristic("tight loop"), True), ("mimikatz", "function=0x401000,bb=0x401000", capa.features.Characteristic("tight loop"), False), + # insn/number + ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x3136B0), True), + # insn/number: stack adjustments + ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xC), False), + ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x10), False), + # insn/number: arch flavors + ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X32), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X64), False), ], indirect=["sample", "scope"], ) @@ -252,22 +262,6 @@ def test_bytes_pointer_features(mimikatz): assert 
capa.features.Bytes("INPUTEVENT".encode("utf-16le")).evaluate(features) == True -def test_number_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) - assert capa.features.insn.Number(0xFF) in features - assert capa.features.insn.Number(0x3136B0) in features - # the following are stack adjustments - assert capa.features.insn.Number(0xC) not in features - assert capa.features.insn.Number(0x10) not in features - - -def test_number_arch_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) - assert capa.features.insn.Number(0xFF) in features - assert capa.features.insn.Number(0xFF, arch=ARCH_X32) in features - assert capa.features.insn.Number(0xFF, arch=ARCH_X64) not in features - - def test_offset_features(mimikatz): features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) assert capa.features.insn.Offset(0x0) in features From fcb8c4a2938b51691f0693aee78fd02d6d69c4a5 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 9 Aug 2020 15:46:34 -0600 Subject: [PATCH 07/44] tests: lancelot: override parametrize for better naming --- tests/test_lancelot_features.py | 43 +++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index f066dcd0..c5ddc6f0 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -126,7 +126,21 @@ def scope(request): raise ValueError("unexpected scope fixture") -@pytest.mark.parametrize( +def parametrize(params, values, **kwargs): + """ + extend `pytest.mark.parametrize` to pretty-print features. + by default, it renders objects as an opaque value. 
+ ref: https://docs.pytest.org/en/2.9.0/example/parametrize.html#different-options-for-test-ids + + rendered ID might look something like: + + mimikatz-function=0x403BAC-api(CryptDestroyKey)-True + """ + ids = ["-".join(map(str, vs)) for vs in values] + return pytest.mark.parametrize(params, values, ids=ids, **kwargs) + + +@parametrize( "sample,scope,feature,expected", [ # file/characteristic("embedded pe") @@ -192,6 +206,19 @@ def scope(request): ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X32), True), ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X64), False), + # insn/api + ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContextW"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContext"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptGenKey"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptImportKey"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptDestroyKey"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptAcquireContextW"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptAcquireContext"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptGenKey"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptImportKey"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptDestroyKey"), True), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("Nope"), False), + ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.Nope"), False), ], indirect=["sample", "scope"], ) @@ -202,20 +229,6 @@ def test_lancelot_features(sample, scope, feature, expected): """ -def test_api_features(mimikatz): - features = 
extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x403BAC)) - assert capa.features.insn.API("advapi32.CryptAcquireContextW") in features - assert capa.features.insn.API("advapi32.CryptAcquireContext") in features - assert capa.features.insn.API("advapi32.CryptGenKey") in features - assert capa.features.insn.API("advapi32.CryptImportKey") in features - assert capa.features.insn.API("advapi32.CryptDestroyKey") in features - assert capa.features.insn.API("CryptAcquireContextW") in features - assert capa.features.insn.API("CryptAcquireContext") in features - assert capa.features.insn.API("CryptGenKey") in features - assert capa.features.insn.API("CryptImportKey") in features - assert capa.features.insn.API("CryptDestroyKey") in features - - def test_api_features_64_bit(sample_a198216798ca38f280dc413f8c57f2c2): features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x4011B0)) assert capa.features.insn.API("kernel32.GetStringTypeA") in features From cdae840519ba8b16df3aa98a668550be725823eb Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 11:49:11 -0600 Subject: [PATCH 08/44] lancelot: file: fix import address --- capa/features/extractors/lancelot/basicblock.py | 1 - capa/features/extractors/lancelot/file.py | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/lancelot/basicblock.py b/capa/features/extractors/lancelot/basicblock.py index b60ea86f..bad58ff4 100644 --- a/capa/features/extractors/lancelot/basicblock.py +++ b/capa/features/extractors/lancelot/basicblock.py @@ -92,7 +92,6 @@ def _bb_has_stackstring(ws, bb): for insn in bb.instructions: if is_mov_imm_to_stack(insn): # add number of operand bytes - print(hex(insn.address)) src = insn.operands[1] count += get_printable_len(src) diff --git a/capa/features/extractors/lancelot/file.py b/capa/features/extractors/lancelot/file.py index 8196e886..913b69fc 100644 --- 
a/capa/features/extractors/lancelot/file.py +++ b/capa/features/extractors/lancelot/file.py @@ -43,13 +43,12 @@ def extract_file_import_names(buf, pe): for entry in pe.DIRECTORY_ENTRY_IMPORT: libname = entry.dll.decode("ascii").lower().partition(".")[0] for imp in entry.imports: - impaddr = base_address + imp.address if imp.ordinal: - yield Import("%s.#%s" % (libname, imp.ordinal)), impaddr + yield Import("%s.#%s" % (libname, imp.ordinal)), imp.address else: impname = imp.name.decode("ascii") - yield Import("%s.%s" % (libname, impname)), impaddr - yield Import("%s" % (impname)), impaddr + yield Import("%s.%s" % (libname, impname)), imp.address + yield Import("%s" % (impname)), imp.address def extract_file_section_names(buf, pe): From 5eded3c5cc6298cbb57fab6e4c6e2dcf3acf50de Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 11:49:37 -0600 Subject: [PATCH 09/44] lancelot: insn: implement API features --- capa/features/extractors/lancelot/__init__.py | 3 +- capa/features/extractors/lancelot/insn.py | 139 +++++++++++++++--- tests/test_lancelot_features.py | 22 ++- 3 files changed, 143 insertions(+), 21 deletions(-) diff --git a/capa/features/extractors/lancelot/__init__.py b/capa/features/extractors/lancelot/__init__.py index 8be8c054..98f7cdc4 100644 --- a/capa/features/extractors/lancelot/__init__.py +++ b/capa/features/extractors/lancelot/__init__.py @@ -52,6 +52,7 @@ class LancelotFeatureExtractor(capa.features.extractors.FeatureExtractor): super(LancelotFeatureExtractor, self).__init__() self.buf = buf self.ws = lancelot.from_bytes(buf) + self.ctx = {} def get_base_address(self): return self.ws.base_address @@ -86,5 +87,5 @@ class LancelotFeatureExtractor(capa.features.extractors.FeatureExtractor): return bb.instructions def extract_insn_features(self, f, bb, insn): - for feature, va in capa.features.extractors.lancelot.insn.extract_insn_features(self.ws, insn): + for feature, va in 
capa.features.extractors.lancelot.insn.extract_insn_features(self, insn): yield feature, va diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 6028d453..7a60c8f5 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -1,14 +1,26 @@ import logging +import pefile + +try: + from functools import lru_cache +except ImportError: + from backports.functools_lru_cache import lru_cache + from lancelot import ( OPERAND_TYPE, PERMISSION_READ, + MEMORY_OPERAND_BASE, + MEMORY_OPERAND_DISP, + OPERAND_TYPE_MEMORY, OPERAND_TYPE_REGISTER, OPERAND_TYPE_IMMEDIATE, IMMEDIATE_OPERAND_VALUE, REGISTER_OPERAND_REGISTER, + IMMEDIATE_OPERAND_IS_RELATIVE, ) +import capa.features.extractors.helpers from capa.features import ARCH_X32, ARCH_X64 from capa.features.insn import Number @@ -29,17 +41,106 @@ def get_arch(ws): raise ValueError("unexpected architecture") -def get_imports(ws): - """caching accessor""" - raise NotImplementedError() +@lru_cache +def get_imports(xtor): + pe = pefile.PE(data=xtor.buf) + + imports = {} + for entry in pe.DIRECTORY_ENTRY_IMPORT: + libname = entry.dll.decode("ascii").lower().partition(".")[0] + for imp in entry.imports: + if imp.ordinal: + imports[imp.address] = "%s.#%s" % (libname, imp.ordinal) + else: + impname = imp.name.decode("ascii") + imports[imp.address] = "%s.%s" % (libname, impname) + return imports -def extract_insn_api_features(ws, insn): +@lru_cache +def get_thunks(xtor): + thunks = {} + for va in xtor.ws.get_functions(): + try: + insn = xtor.ws.read_insn(va) + except ValueError: + continue + + if insn.mnemonic != "jmp": + continue + + op0 = insn.operands[0] + + if op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: + target = op0[MEMORY_OPERAND_DISP] + + # direct, x64, rip relative + # 180020570 FF 25 DA 83 05 00 jmp cs:RtlCaptureContext_0 + if op0[MEMORY_OPERAND_BASE] == "rip": + target = op0[MEMORY_OPERAND_DISP] + insn.address + insn.length + + # 
direct, x32 + # mimikatz:.text:0046AE12 FF 25 54 30 47 00 jmp ds:__imp_LsaQueryInformationPolicy + elif op0[MEMORY_OPERAND_BASE] == None: + target = op0[MEMORY_OPERAND_DISP] + + else: + continue + + imports = get_imports(xtor) + if target not in imports: + continue + + thunks[va] = imports[target] + continue + + return thunks + + +def extract_insn_api_features(xtor, insn): """parse API features from the given instruction.""" + + if insn.mnemonic != "call": + return + + op0 = insn.operands[0] + + if op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: + + # call direct, x64 + # rip relative + # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 + if op0[MEMORY_OPERAND_BASE] == "rip": + target = op0[MEMORY_OPERAND_DISP] + insn.address + insn.length + + # call direct, x32 + # mimikatz:0x403BD3 call ds:CryptAcquireContextW + elif op0[MEMORY_OPERAND_BASE] == None: + target = op0[MEMORY_OPERAND_DISP] + + else: + return + + imports = get_imports(xtor) + if target in imports: + for feature, va in capa.features.extractors.helpers.generate_api_features(imports[target], insn.address): + yield feature, va + + # call via thunk + # mimikatz:0x455A41 call LsaQueryInformationPolicy + elif op0[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op0[IMMEDIATE_OPERAND_IS_RELATIVE]: + target = op0[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length + thunks = get_thunks(xtor) + if target in thunks: + for feature, va in capa.features.extractors.helpers.generate_api_features(thunks[target], insn.address): + yield feature, va + + # call on x64 + raise NotImplementedError() -def extract_insn_number_features(ws, insn): +def extract_insn_number_features(xtor, insn): """parse number features from the given instruction.""" operands = insn.operands @@ -49,7 +150,7 @@ def extract_insn_number_features(ws, insn): v = oper[IMMEDIATE_OPERAND_VALUE] - if ws.probe(v) & PERMISSION_READ: + if xtor.ws.probe(v) & PERMISSION_READ: # v is a valid address # therefore, assume its not also a constant. 
continue @@ -66,7 +167,7 @@ def extract_insn_number_features(ws, insn): return yield Number(v), insn.address - yield Number(v, arch=get_arch(ws)), insn.address + yield Number(v, arch=get_arch(xtor.ws)), insn.address def derefs(ws, p): @@ -85,7 +186,7 @@ def read_bytes(ws, va): raise NotImplementedError() -def extract_insn_bytes_features(ws, insn): +def extract_insn_bytes_features(xtor, insn): """ parse byte sequence features from the given instruction. """ @@ -96,12 +197,12 @@ def read_string(ws, va): raise NotImplementedError() -def extract_insn_string_features(ws, insn): +def extract_insn_string_features(xtor, insn): """parse string features from the given instruction.""" raise NotImplementedError() -def extract_insn_offset_features(ws, insn): +def extract_insn_offset_features(xtor, insn): """parse structure offset features from the given instruction.""" raise NotImplementedError() @@ -113,7 +214,7 @@ def is_security_cookie(ws, insn): raise NotImplementedError() -def extract_insn_nzxor_characteristic_features(ws, insn): +def extract_insn_nzxor_characteristic_features(xtor, insn): """ parse non-zeroing XOR instruction from the given instruction. ignore expected non-zeroing XORs, e.g. security cookies. @@ -121,24 +222,24 @@ def extract_insn_nzxor_characteristic_features(ws, insn): raise NotImplementedError() -def extract_insn_mnemonic_features(ws, insn): +def extract_insn_mnemonic_features(xtor, insn): """parse mnemonic features from the given instruction.""" raise NotImplementedError() -def extract_insn_peb_access_characteristic_features(ws, insn): +def extract_insn_peb_access_characteristic_features(xtor, insn): """ parse peb access from the given function. 
fs:[0x30] on x86, gs:[0x60] on x64 """ raise NotImplementedError() -def extract_insn_segment_access_features(ws, insn): +def extract_insn_segment_access_features(xtor, insn): """ parse the instruction for access to fs or gs """ raise NotImplementedError() -def extract_insn_cross_section_cflow(ws, insn): +def extract_insn_cross_section_cflow(xtor, insn): """ inspect the instruction for a CALL or JMP that crosses section boundaries. """ @@ -147,13 +248,13 @@ def extract_insn_cross_section_cflow(ws, insn): # this is a feature that's most relevant at the function scope, # however, its most efficient to extract at the instruction scope. -def extract_function_calls_from(ws, insn): +def extract_function_calls_from(xtor, insn): raise NotImplementedError() # this is a feature that's most relevant at the function or basic block scope, # however, its most efficient to extract at the instruction scope. -def extract_function_indirect_call_characteristic_features(ws, insn): +def extract_function_indirect_call_characteristic_features(xtor, insn): """ extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4]) does not include calls like => call ds:dword_ABD4974 @@ -164,10 +265,10 @@ def extract_function_indirect_call_characteristic_features(ws, insn): _not_implemented = set([]) -def extract_insn_features(ws, insn): +def extract_insn_features(xtor, insn): for insn_handler in INSTRUCTION_HANDLERS: try: - for feature, va in insn_handler(ws, insn): + for feature, va in insn_handler(xtor, insn): yield feature, va except NotImplementedError: if insn_handler.__name__ not in _not_implemented: diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index c5ddc6f0..d819bdd9 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -12,7 +12,6 @@ import collections try: from functools import lru_cache except ImportError: - # pip install backports.functools-lru-cache from backports.functools_lru_cache import 
lru_cache import pytest @@ -77,6 +76,8 @@ def sample(request): return os.path.join(CD, "data", "mimikatz.exe_") elif request.param == "kernel32": return os.path.join(CD, "data", "kernel32.dll_") + elif request.param == "kernel32-64": + return os.path.join(CD, "data", "kernel32-64.dll_") elif request.param == "pma12-04": return os.path.join(CD, "data", "Practical Malware Analysis Lab 12-04.exe_") else: @@ -219,6 +220,25 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptDestroyKey"), True), ("mimikatz", "function=0x403BAC", capa.features.insn.API("Nope"), False), ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.Nope"), False), + # insn/api: thunk + ("mimikatz", "function=0x4556E5", capa.features.insn.API("advapi32.LsaQueryInformationPolicy"), True), + ("mimikatz", "function=0x4556E5", capa.features.insn.API("LsaQueryInformationPolicy"), True), + # insn/api: x64 + ( + "kernel32-64", + "function=0x180001010", + capa.features.insn.API("api-ms-win-core-rtlsupport-l1-1-0.RtlVirtualUnwind"), + True, + ), + ("kernel32-64", "function=0x180001010", capa.features.insn.API("RtlVirtualUnwind"), True), + # insn/api: x64 thunk + ( + "kernel32-64", + "function=0x1800202B0", + capa.features.insn.API("api-ms-win-core-rtlsupport-l1-1-0.RtlCaptureContext"), + True, + ), + ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True), ], indirect=["sample", "scope"], ) From 7996e2efe770bb2c55f7e83d371728f3163e3788 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 11:51:48 -0600 Subject: [PATCH 10/44] tests: lancelot: remove old tests --- tests/test_lancelot_features.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index d819bdd9..aa1ef18e 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -249,19 +249,6 @@ def 
test_lancelot_features(sample, scope, feature, expected): """ -def test_api_features_64_bit(sample_a198216798ca38f280dc413f8c57f2c2): - features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x4011B0)) - assert capa.features.insn.API("kernel32.GetStringTypeA") in features - assert capa.features.insn.API("kernel32.GetStringTypeW") not in features - assert capa.features.insn.API("kernel32.GetStringType") in features - assert capa.features.insn.API("GetStringTypeA") in features - assert capa.features.insn.API("GetStringType") in features - # call via thunk in IDA Pro - features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x401CB0)) - assert capa.features.insn.API("msvcrt.vfprintf") in features - assert capa.features.insn.API("vfprintf") in features - - def test_string_features(mimikatz): features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) assert capa.features.String("SCardControl") in features @@ -397,14 +384,6 @@ def test_segment_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): assert capa.features.Characteristic("fs access") in features -def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a): - features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x407970)) - assert capa.features.insn.API("kernel32.CreateToolhelp32Snapshot") in features - assert capa.features.insn.API("CreateToolhelp32Snapshot") in features - - - - def test_switch_features(mimikatz): features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x409411)) assert capa.features.Characteristic("switch") in features @@ -437,14 +416,6 @@ def test_loop_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f534 assert capa.features.Characteristic("loop") not in features -def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5): - features = extract_file_features( - 
sample_bfb9b5391a13d0afd787e87ab90f14f5.ws, sample_bfb9b5391a13d0afd787e87ab90f14f5.path, - ) - assert capa.features.String("WarStop") in features # ASCII, offset 0x40EC - assert capa.features.String("cimage/png") in features # UTF-16 LE, offset 0x350E - - def test_function_calls_to(sample_9324d1a8ae37a36ae560c37448c9705a): features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) assert capa.features.Characteristic("calls to") in features From 1da450001c6c0533d88b6757039354fd0c69c9ff Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 13:47:43 -0600 Subject: [PATCH 11/44] lancelot: insn: offset --- capa/features/extractors/lancelot/insn.py | 25 ++++++++++++---- tests/test_lancelot_features.py | 36 +++++++++-------------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 7a60c8f5..565dd4f4 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -22,7 +22,7 @@ from lancelot import ( import capa.features.extractors.helpers from capa.features import ARCH_X32, ARCH_X64 -from capa.features.insn import Number +from capa.features.insn import Number, Offset logger = logging.getLogger(__name__) @@ -170,6 +170,24 @@ def extract_insn_number_features(xtor, insn): yield Number(v, arch=get_arch(xtor.ws)), insn.address +def extract_insn_offset_features(xtor, insn): + """parse structure offset features from the given instruction.""" + operands = insn.operands + + for oper in operands: + if oper[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: + continue + + if oper[MEMORY_OPERAND_BASE] in ("esp", "ebp", "rbp"): + continue + + # lancelot provides `None` when the displacement is not present. 
+ v = oper[MEMORY_OPERAND_DISP] or 0 + + yield Offset(v), insn.address + yield Offset(v, arch=get_arch(xtor.ws)), insn.address + + def derefs(ws, p): """ recursively follow the given pointer, yielding the valid memory addresses along the way. @@ -202,11 +220,6 @@ def extract_insn_string_features(xtor, insn): raise NotImplementedError() -def extract_insn_offset_features(xtor, insn): - """parse structure offset features from the given instruction.""" - raise NotImplementedError() - - def is_security_cookie(ws, insn): """ check if an instruction is related to security cookie checks diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index aa1ef18e..0d88af34 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -207,6 +207,20 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X32), True), ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X64), False), + # insn/offset + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x4), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0xC), True), + # insn/offset: stack references + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x8), False), + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x10), False), + # insn/offset: negative + ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), + ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), + # insn/offset: arch flavors + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0, arch=ARCH_X32), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0, arch=ARCH_X64), False), # insn/api 
("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContextW"), True), ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContext"), True), @@ -282,28 +296,6 @@ def test_bytes_pointer_features(mimikatz): assert capa.features.Bytes("INPUTEVENT".encode("utf-16le")).evaluate(features) == True -def test_offset_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) - assert capa.features.insn.Offset(0x0) in features - assert capa.features.insn.Offset(0x4) in features - assert capa.features.insn.Offset(0xC) in features - # the following are stack references - assert capa.features.insn.Offset(0x8) not in features - assert capa.features.insn.Offset(0x10) not in features - - # this function has the following negative offsets - # movzx ecx, byte ptr [eax-1] - # movzx eax, byte ptr [eax-2] - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x4011FB)) - assert capa.features.insn.Offset(-0x1) in features - assert capa.features.insn.Offset(-0x2) in features - - -def test_offset_arch_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) - assert capa.features.insn.Offset(0x0) in features - assert capa.features.insn.Offset(0x0, arch=ARCH_X32) in features - assert capa.features.insn.Offset(0x0, arch=ARCH_X64) not in features def test_nzxor_features(mimikatz): From 042654ee979e220048c4b1eb5c7f77952d9a55bf Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 13:50:46 -0600 Subject: [PATCH 12/44] lancelot: insn: mnemonic --- capa/features/extractors/lancelot/insn.py | 12 ++++++------ tests/test_lancelot_features.py | 14 ++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 565dd4f4..85877bd4 100644 --- a/capa/features/extractors/lancelot/insn.py +++ 
b/capa/features/extractors/lancelot/insn.py @@ -22,7 +22,7 @@ from lancelot import ( import capa.features.extractors.helpers from capa.features import ARCH_X32, ARCH_X64 -from capa.features.insn import Number, Offset +from capa.features.insn import Number, Offset, Mnemonic logger = logging.getLogger(__name__) @@ -140,6 +140,11 @@ def extract_insn_api_features(xtor, insn): raise NotImplementedError() +def extract_insn_mnemonic_features(xtor, insn): + """parse mnemonic features from the given instruction.""" + yield Mnemonic(insn.mnemonic), insn.address + + def extract_insn_number_features(xtor, insn): """parse number features from the given instruction.""" operands = insn.operands @@ -235,11 +240,6 @@ def extract_insn_nzxor_characteristic_features(xtor, insn): raise NotImplementedError() -def extract_insn_mnemonic_features(xtor, insn): - """parse mnemonic features from the given instruction.""" - raise NotImplementedError() - - def extract_insn_peb_access_characteristic_features(xtor, insn): """ parse peb access from the given function. 
fs:[0x30] on x86, gs:[0x60] on x64 diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 0d88af34..6261b23a 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -197,6 +197,12 @@ def parametrize(params, values, **kwargs): # bb/characteristic(tight loop) ("mimikatz", "function=0x402EC4,bb=0x402F8E", capa.features.Characteristic("tight loop"), True), ("mimikatz", "function=0x401000,bb=0x401000", capa.features.Characteristic("tight loop"), False), + # insn/mnemonic + ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("push"), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("movzx"), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("xor"), True), + ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("in"), False), + ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("out"), False), # insn/number ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x3136B0), True), @@ -326,14 +332,6 @@ def test_is_security_cookie(mimikatz): assert capa.features.extractors.lancelot.insn.is_security_cookie(f, bb, insn) == True -def test_mnemonic_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) - assert capa.features.insn.Mnemonic("push") in features - assert capa.features.insn.Mnemonic("movzx") in features - assert capa.features.insn.Mnemonic("xor") in features - - assert capa.features.insn.Mnemonic("in") not in features - assert capa.features.insn.Mnemonic("out") not in features def test_peb_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): From 10f5a54e1dca8e0d0812e71493b46204ab4d0480 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 17:08:28 -0600 Subject: [PATCH 13/44] lancelot: insn: bytes --- capa/features/extractors/lancelot/insn.py | 128 +++++++++++++++++++--- 
tests/test_lancelot_features.py | 43 ++++---- 2 files changed, 132 insertions(+), 39 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 85877bd4..654eb9b2 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -21,7 +21,7 @@ from lancelot import ( ) import capa.features.extractors.helpers -from capa.features import ARCH_X32, ARCH_X64 +from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String from capa.features.insn import Number, Offset, Mnemonic logger = logging.getLogger(__name__) @@ -41,9 +41,14 @@ def get_arch(ws): raise ValueError("unexpected architecture") +@lru_cache +def get_pefile(xtor): + return pefile.PE(data=xtor.buf) + + @lru_cache def get_imports(xtor): - pe = pefile.PE(data=xtor.buf) + pe = get_pefile(xtor) imports = {} for entry in pe.DIRECTORY_ENTRY_IMPORT: @@ -135,10 +140,6 @@ def extract_insn_api_features(xtor, insn): for feature, va in capa.features.extractors.helpers.generate_api_features(thunks[target], insn.address): yield feature, va - # call on x64 - - raise NotImplementedError() - def extract_insn_mnemonic_features(xtor, insn): """parse mnemonic features from the given instruction.""" @@ -149,11 +150,11 @@ def extract_insn_number_features(xtor, insn): """parse number features from the given instruction.""" operands = insn.operands - for oper in operands: - if oper[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: + for operand in operands: + if operand[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: continue - v = oper[IMMEDIATE_OPERAND_VALUE] + v = operand[IMMEDIATE_OPERAND_VALUE] if xtor.ws.probe(v) & PERMISSION_READ: # v is a valid address @@ -179,41 +180,132 @@ def extract_insn_offset_features(xtor, insn): """parse structure offset features from the given instruction.""" operands = insn.operands - for oper in operands: - if oper[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: + for operand in operands: + if 
operand[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: continue - if oper[MEMORY_OPERAND_BASE] in ("esp", "ebp", "rbp"): + if operand[MEMORY_OPERAND_BASE] in ("esp", "ebp", "rbp"): continue # lancelot provides `None` when the displacement is not present. - v = oper[MEMORY_OPERAND_DISP] or 0 + v = operand[MEMORY_OPERAND_DISP] or 0 yield Offset(v), insn.address yield Offset(v, arch=get_arch(xtor.ws)), insn.address -def derefs(ws, p): +def derefs(xtor, p): """ recursively follow the given pointer, yielding the valid memory addresses along the way. useful when you may have a pointer to string, or pointer to pointer to string, etc. this is a "do what i mean" type of helper function. """ - raise NotImplementedError() + + depth = 0 + while True: + if not xtor.ws.probe(p) & PERMISSION_READ: + return + yield p + + next = xtor.ws.read_pointer(p) + + # sanity: pointer points to self + if next == p: + return + + # sanity: avoid chains of pointers that are unreasonably deep + depth += 1 + if depth > 10: + return + + p = next -def read_bytes(ws, va): +def get_operand_target(insn, op): + if op[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: + # call direct, x64 + # rip relative + # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 + if op[MEMORY_OPERAND_BASE] == "rip": + return op[MEMORY_OPERAND_DISP] + insn.address + insn.length + + # call direct, x32 + # mimikatz:0x403BD3 call ds:CryptAcquireContextW + elif op[MEMORY_OPERAND_BASE] == None: + return op[MEMORY_OPERAND_DISP] + + # call via thunk + # mimikatz:0x455A41 call LsaQueryInformationPolicy + elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op[IMMEDIATE_OPERAND_IS_RELATIVE]: + return op[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length + + elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE: + return op[IMMEDIATE_OPERAND_VALUE] + + raise ValueError("memory operand has no target") + + +def read_bytes(xtor, va): """ read up to MAX_BYTES_FEATURE_SIZE from the given address. + + raises: + ValueError: if the given address is not valid. 
""" - raise NotImplementedError() + start = va + end = va + MAX_BYTES_FEATURE_SIZE + pe = get_pefile(xtor) + + for section in pe.sections: + section_start = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section_end = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section.Misc_VirtualSize + + if section_start <= start < section_end: + end = min(end, section_end) + return xtor.ws.read_bytes(start, end - start) + + raise ValueError("invalid address") def extract_insn_bytes_features(xtor, insn): """ parse byte sequence features from the given instruction. """ - raise NotImplementedError() + if insn.mnemonic == "call": + return + + if insn.address == 0x401089: + print(insn) + print(insn.operands) + + for operand in insn.operands: + try: + target = get_operand_target(insn, operand) + except ValueError: + continue + + for ptr in derefs(xtor, target): + if insn.address == 0x401089: + print(hex(ptr)) + + try: + buf = read_bytes(xtor, ptr) + except ValueError: + if insn.address == 0x401089: + print("err") + continue + + if capa.features.extractors.helpers.all_zeros(buf): + if insn.address == 0x401089: + print("zeros") + continue + + if insn.address == 0x401089: + import hexdump + + hexdump.hexdump(buf) + + yield Bytes(buf), insn.address def read_string(ws, va): diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 6261b23a..ff0d8341 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -31,9 +31,9 @@ CD = os.path.dirname(__file__) @lru_cache def extract_file_features(extractor): - features = set([]) + features = collections.defaultdict(set) for feature, va in extractor.extract_file_features(): - features.add(feature) + features[feature].add(va) return features @@ -53,12 +53,12 @@ def extract_function_features(extractor, f): @lru_cache def extract_basic_block_features(extractor, f, bb): - features = set({}) + features = collections.defaultdict(set) for insn in extractor.get_instructions(f, bb): 
- for feature, _ in extractor.extract_insn_features(f, bb, insn): - features.add(feature) - for feature, _ in extractor.extract_basic_block_features(f, bb): - features.add(feature) + for feature, va in extractor.extract_insn_features(f, bb, insn): + features[feature].add(va) + for feature, va in extractor.extract_basic_block_features(f, bb): + features[feature].add(va) return features @@ -259,29 +259,30 @@ def parametrize(params, values, **kwargs): True, ), ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True), + # insn/string + ("mimikatz", "function=0x40105D", capa.features.String("SCardControl"), True), + ("mimikatz", "function=0x40105D", capa.features.String("SCardTransmit"), True), + ("mimikatz", "function=0x40105D", capa.features.String("ACR > "), True), + ("mimikatz", "function=0x40105D", capa.features.String("nope"), False), + # insn/string, pointer to string + ("mimikatz", "function=0x44EDEF", capa.features.String("INPUTEVENT"), True), + # insn/bytes + ("mimikatz", "function=0x40105D", capa.features.Bytes("SCardControl".encode("utf-16le")), True), + ("mimikatz", "function=0x40105D", capa.features.Bytes("SCardTransmit".encode("utf-16le")), True), + ("mimikatz", "function=0x40105D", capa.features.Bytes("ACR > ".encode("utf-16le")), True), + ("mimikatz", "function=0x40105D", capa.features.Bytes("nope".encode("ascii")), False), + # insn/bytes, pointer to bytes + ("mimikatz", "function=0x44EDEF", capa.features.Bytes("INPUTEVENT".encode("utf-16le")), True), ], indirect=["sample", "scope"], ) def test_lancelot_features(sample, scope, feature, expected): extractor = get_lancelot_extractor(sample) features = scope(extractor) - assert (feature in features) == expected + assert feature.evaluate(features) == expected """ -def test_string_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x40105D)) - assert capa.features.String("SCardControl") in features - assert 
capa.features.String("SCardTransmit") in features - assert capa.features.String("ACR > ") in features - # other strings not in this function - assert capa.features.String("bcrypt.dll") not in features - - -def test_string_pointer_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x44EDEF)) - assert capa.features.String("INPUTEVENT") in features - def test_byte_features(sample_9324d1a8ae37a36ae560c37448c9705a): features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) From 695f1bf55a3c71bd2547ac49cd774d0ca7dce90f Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 17:23:19 -0600 Subject: [PATCH 14/44] lancelot: insn: strings --- capa/features/extractors/lancelot/insn.py | 60 +++++++++++++++-------- tests/test_lancelot_features.py | 21 -------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 654eb9b2..fa17a998 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -1,4 +1,5 @@ import logging +import itertools import pefile @@ -271,13 +272,33 @@ def extract_insn_bytes_features(xtor, insn): """ parse byte sequence features from the given instruction. 
""" - if insn.mnemonic == "call": + if insn.mnemonic == ( + "call", + "jb", + "jbe", + "jcxz", + "jecxz", + "jknzd", + "jkzd", + "jl", + "jle", + "jmp", + "jnb", + "jnbe", + "jnl", + "jnle", + "jno", + "jnp", + "jns", + "jnz", + "jo", + "jp", + "jrcxz", + "js", + "jz", + ): return - if insn.address == 0x401089: - print(insn) - print(insn.operands) - for operand in insn.operands: try: target = get_operand_target(insn, operand) @@ -285,36 +306,35 @@ def extract_insn_bytes_features(xtor, insn): continue for ptr in derefs(xtor, target): - if insn.address == 0x401089: - print(hex(ptr)) - try: buf = read_bytes(xtor, ptr) except ValueError: - if insn.address == 0x401089: - print("err") continue if capa.features.extractors.helpers.all_zeros(buf): - if insn.address == 0x401089: - print("zeros") continue - if insn.address == 0x401089: - import hexdump - - hexdump.hexdump(buf) - yield Bytes(buf), insn.address -def read_string(ws, va): - raise NotImplementedError() +def first(s): + """enumerate the first element in the sequence""" + for i in s: + yield i + break def extract_insn_string_features(xtor, insn): """parse string features from the given instruction.""" - raise NotImplementedError() + for bytez, va in extract_insn_bytes_features(xtor, insn): + buf = bytez.value + + for s in itertools.chain( + first(capa.features.extractors.strings.extract_ascii_strings(buf)), + first(capa.features.extractors.strings.extract_unicode_strings(buf)), + ): + if s.offset == 0: + yield String(s.s), va def is_security_cookie(ws, insn): diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index ff0d8341..f908f119 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -284,27 +284,6 @@ def test_lancelot_features(sample, scope, feature, expected): """ -def test_byte_features(sample_9324d1a8ae37a36ae560c37448c9705a): - features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) - 
wanted = capa.features.Bytes(b"\xED\x24\x9E\xF4\x52\xA9\x07\x47\x55\x8E\xE1\xAB\x30\x8E\x23\x61") - # use `==` rather than `is` because the result is not `True` but a truthy value. - assert wanted.evaluate(features) == True - - -def test_byte_features64(sample_lab21_01): - features = extract_function_features(lancelot_utils.Function(sample_lab21_01.ws, 0x1400010C0)) - wanted = capa.features.Bytes(b"\x32\xA2\xDF\x2D\x99\x2B\x00\x00") - # use `==` rather than `is` because the result is not `True` but a truthy value. - assert wanted.evaluate(features) == True - - -def test_bytes_pointer_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x44EDEF)) - assert capa.features.Bytes("INPUTEVENT".encode("utf-16le")).evaluate(features) == True - - - - def test_nzxor_features(mimikatz): features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x410DFC)) assert capa.features.Characteristic("nzxor") in features # 0x0410F0B From 3f49a224f5576387a2059f2a3b323d735c5a43cb Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:03:40 -0600 Subject: [PATCH 15/44] lancelot: off-by-one instruction enumerator --- capa/features/extractors/lancelot/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/lancelot/__init__.py b/capa/features/extractors/lancelot/__init__.py index 98f7cdc4..7a1a43a1 100644 --- a/capa/features/extractors/lancelot/__init__.py +++ b/capa/features/extractors/lancelot/__init__.py @@ -36,7 +36,7 @@ class BB(object): @property def instructions(self): va = self.address - while va <= self.address + self.length: + while va < self.address + self.length: try: insn = self.ws.read_insn(va) except ValueError: From 7ac4cf47f77a8fa7bf12055397cfef56afc6d9e5 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:04:37 -0600 Subject: [PATCH 16/44] lancelot: insn: pass f, bb, insn throughout --- 
capa/features/extractors/lancelot/__init__.py | 2 +- capa/features/extractors/lancelot/insn.py | 22 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/capa/features/extractors/lancelot/__init__.py b/capa/features/extractors/lancelot/__init__.py index 7a1a43a1..bda1c89b 100644 --- a/capa/features/extractors/lancelot/__init__.py +++ b/capa/features/extractors/lancelot/__init__.py @@ -87,5 +87,5 @@ class LancelotFeatureExtractor(capa.features.extractors.FeatureExtractor): return bb.instructions def extract_insn_features(self, f, bb, insn): - for feature, va in capa.features.extractors.lancelot.insn.extract_insn_features(self, insn): + for feature, va in capa.features.extractors.lancelot.insn.extract_insn_features(self, f, bb, insn): yield feature, va diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index fa17a998..a572ddf7 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -103,7 +103,7 @@ def get_thunks(xtor): return thunks -def extract_insn_api_features(xtor, insn): +def extract_insn_api_features(xtor, f, bb, insn): """parse API features from the given instruction.""" if insn.mnemonic != "call": @@ -142,12 +142,12 @@ def extract_insn_api_features(xtor, insn): yield feature, va -def extract_insn_mnemonic_features(xtor, insn): +def extract_insn_mnemonic_features(xtor, f, bb, insn): """parse mnemonic features from the given instruction.""" yield Mnemonic(insn.mnemonic), insn.address -def extract_insn_number_features(xtor, insn): +def extract_insn_number_features(xtor, f, bb, insn): """parse number features from the given instruction.""" operands = insn.operands @@ -177,7 +177,7 @@ def extract_insn_number_features(xtor, insn): yield Number(v, arch=get_arch(xtor.ws)), insn.address -def extract_insn_offset_features(xtor, insn): +def extract_insn_offset_features(xtor, f, bb, insn): """parse structure offset features from the given 
instruction.""" operands = insn.operands @@ -268,7 +268,7 @@ def read_bytes(xtor, va): raise ValueError("invalid address") -def extract_insn_bytes_features(xtor, insn): +def extract_insn_bytes_features(xtor, f, bb, insn): """ parse byte sequence features from the given instruction. """ @@ -324,9 +324,9 @@ def first(s): break -def extract_insn_string_features(xtor, insn): +def extract_insn_string_features(xtor, f, bb, insn): """parse string features from the given instruction.""" - for bytez, va in extract_insn_bytes_features(xtor, insn): + for bytez, va in extract_insn_bytes_features(xtor, f, bb, insn): buf = bytez.value for s in itertools.chain( @@ -373,13 +373,13 @@ def extract_insn_cross_section_cflow(xtor, insn): # this is a feature that's most relevant at the function scope, # however, its most efficient to extract at the instruction scope. -def extract_function_calls_from(xtor, insn): +def extract_function_calls_from(xtor, f, bb, insn): raise NotImplementedError() # this is a feature that's most relevant at the function or basic block scope, # however, its most efficient to extract at the instruction scope. 
-def extract_function_indirect_call_characteristic_features(xtor, insn): +def extract_function_indirect_call_characteristic_features(xtor, f, bb, insn): """ extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4]) does not include calls like => call ds:dword_ABD4974 @@ -390,10 +390,10 @@ def extract_function_indirect_call_characteristic_features(xtor, insn): _not_implemented = set([]) -def extract_insn_features(xtor, insn): +def extract_insn_features(xtor, f, bb, insn): for insn_handler in INSTRUCTION_HANDLERS: try: - for feature, va in insn_handler(xtor, insn): + for feature, va in insn_handler(xtor, f, bb, insn): yield feature, va except NotImplementedError: if insn_handler.__name__ not in _not_implemented: From c2f55fad1218c67a412c7e67dd9dc1cfc3ff76fa Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:05:08 -0600 Subject: [PATCH 17/44] tests: lancelot: construct assert message --- tests/test_lancelot_features.py | 46 +++++++++++++-------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index f908f119..48858ef6 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -101,7 +101,12 @@ def get_basic_block(extractor, f, va): @pytest.fixture def scope(request): if request.param == "file": - return extract_file_features + + def inner(extractor): + return extract_file_features(extractor) + + inner.__name__ = request.param + return inner elif "bb=" in request.param: # like `function=0x401000,bb=0x40100A` fspec, _, bbspec = request.param.partition(",") @@ -113,6 +118,7 @@ def scope(request): bb = get_basic_block(extractor, f, bbva) return extract_basic_block_features(extractor, f, bb) + inner.__name__ = request.param return inner elif request.param.startswith("function"): # like `function=0x401000` @@ -122,6 +128,7 @@ def scope(request): f = get_function(extractor, va) return 
extract_function_features(extractor, f) + inner.__name__ = request.param return inner else: raise ValueError("unexpected scope fixture") @@ -273,13 +280,22 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.Bytes("nope".encode("ascii")), False), # insn/bytes, pointer to bytes ("mimikatz", "function=0x44EDEF", capa.features.Bytes("INPUTEVENT".encode("utf-16le")), True), + # insn/characteristic(nzxor) + ("mimikatz", "function=0x410DFC", capa.features.Characteristic("nzxor"), True), + ("mimikatz", "function=0x40105D", capa.features.Characteristic("nzxor"), False), + # insn/characteristic(nzxor): no security cookies + ("mimikatz", "function=0x46B67A", capa.features.Characteristic("nzxor"), False), ], indirect=["sample", "scope"], ) def test_lancelot_features(sample, scope, feature, expected): extractor = get_lancelot_extractor(sample) features = scope(extractor) - assert feature.evaluate(features) == expected + if expected: + msg = "%s should be found in %s" % (str(feature), scope.__name__) + else: + msg = "%s should not be found in %s" % (str(feature), scope.__name__) + assert feature.evaluate(features) == expected, msg """ @@ -288,32 +304,6 @@ def test_nzxor_features(mimikatz): features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x410DFC)) assert capa.features.Characteristic("nzxor") in features # 0x0410F0B - -def get_bb_insn(f, va): - # fetch the BasicBlock and Instruction instances for the given VA in the given function. 
- for bb in f.basic_blocks: - for insn in bb.instructions: - if insn.va == va: - return (bb, insn) - raise KeyError(va) - - -def test_is_security_cookie(mimikatz): - # not a security cookie check - f = lancelot_utils.Function(mimikatz.ws, 0x410DFC) - for va in [0x0410F0B]: - bb, insn = get_bb_insn(f, va) - assert capa.features.extractors.lancelot.insn.is_security_cookie(f, bb, insn) == False - - # security cookie initial set and final check - f = lancelot_utils.Function(mimikatz.ws, 0x46C54A) - for va in [0x46C557, 0x46C63A]: - bb, insn = get_bb_insn(f, va) - assert capa.features.extractors.lancelot.insn.is_security_cookie(f, bb, insn) == True - - - - def test_peb_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): features = extract_function_features(lancelot_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.ws, 0xABA6FEC)) assert capa.features.Characteristic("peb access") in features From e7bf5bfceb2486bd84733959a6f5cb6dc733b725 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:05:26 -0600 Subject: [PATCH 18/44] lancelot: insn: nzxor --- capa/features/extractors/lancelot/insn.py | 44 ++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index a572ddf7..041bd040 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -22,7 +22,7 @@ from lancelot import ( ) import capa.features.extractors.helpers -from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String +from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic from capa.features.insn import Number, Offset, Mnemonic logger = logging.getLogger(__name__) @@ -337,34 +337,62 @@ def extract_insn_string_features(xtor, f, bb, insn): yield String(s.s), va -def is_security_cookie(ws, insn): +def is_security_cookie(xtor, f, bb, insn): """ check if an 
instruction is related to security cookie checks """ - raise NotImplementedError() + op1 = insn.operands[1] + if op1[OPERAND_TYPE] == OPERAND_TYPE_REGISTER and op1[REGISTER_OPERAND_REGISTER] not in ( + "esp", + "ebp", + "rbp", + "rsp", + ): + return False + + # expect security cookie init in first basic block within first bytes (instructions) + if f == bb.address and insn.address < (bb.address + SECURITY_COOKIE_BYTES_DELTA): + return True + + # ... or within last bytes (instructions) before a return + insns = list(xtor.get_instructions(f, bb)) + if insns[-1].mnemonic in ("ret", "retn") and insn.address > (bb.address + bb.length - SECURITY_COOKIE_BYTES_DELTA): + return True + + return False -def extract_insn_nzxor_characteristic_features(xtor, insn): +def extract_insn_nzxor_characteristic_features(xtor, f, bb, insn): """ parse non-zeroing XOR instruction from the given instruction. ignore expected non-zeroing XORs, e.g. security cookies. """ - raise NotImplementedError() + if insn.mnemonic != "xor": + return + + operands = insn.operands + if operands[0] == operands[1]: + return + + if is_security_cookie(xtor, f, bb, insn): + return + + yield Characteristic("nzxor"), insn.address -def extract_insn_peb_access_characteristic_features(xtor, insn): +def extract_insn_peb_access_characteristic_features(xtor, f, bb, insn): """ parse peb access from the given function. fs:[0x30] on x86, gs:[0x60] on x64 """ raise NotImplementedError() -def extract_insn_segment_access_features(xtor, insn): +def extract_insn_segment_access_features(xtor, f, bb, insn): """ parse the instruction for access to fs or gs """ raise NotImplementedError() -def extract_insn_cross_section_cflow(xtor, insn): +def extract_insn_cross_section_cflow(xtor, f, bb, insn): """ inspect the instruction for a CALL or JMP that crosses section boundaries. 
""" From 5929c0652c57a75a6b1f3052bb6a541963cd537b Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:15:10 -0600 Subject: [PATCH 19/44] lancelot: insn: fs/gs --- capa/features/extractors/lancelot/insn.py | 23 +++++++++++++++++++++-- tests/test_lancelot_features.py | 16 ++++++---------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 041bd040..158d9ab7 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -15,6 +15,7 @@ from lancelot import ( MEMORY_OPERAND_DISP, OPERAND_TYPE_MEMORY, OPERAND_TYPE_REGISTER, + MEMORY_OPERAND_SEGMENT, OPERAND_TYPE_IMMEDIATE, IMMEDIATE_OPERAND_VALUE, REGISTER_OPERAND_REGISTER, @@ -384,12 +385,30 @@ def extract_insn_peb_access_characteristic_features(xtor, f, bb, insn): """ parse peb access from the given function. fs:[0x30] on x86, gs:[0x60] on x64 """ - raise NotImplementedError() + for operand in insn.operands: + if ( + operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY + and operand[MEMORY_OPERAND_SEGMENT] == "gs" + and operand[MEMORY_OPERAND_DISP] == 0x60 + ): + yield Characteristic("peb access"), insn.address + + if ( + operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY + and operand[MEMORY_OPERAND_SEGMENT] == "fs" + and operand[MEMORY_OPERAND_DISP] == 0x30 + ): + yield Characteristic("peb access"), insn.address def extract_insn_segment_access_features(xtor, f, bb, insn): """ parse the instruction for access to fs or gs """ - raise NotImplementedError() + for operand in insn.operands: + if operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and operand[MEMORY_OPERAND_SEGMENT] == "gs": + yield Characteristic("gs access"), insn.address + + if operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and operand[MEMORY_OPERAND_SEGMENT] == "fs": + yield Characteristic("fs access"), insn.address def extract_insn_cross_section_cflow(xtor, f, bb, insn): diff --git 
a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 48858ef6..bb399cb0 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -285,6 +285,12 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.Characteristic("nzxor"), False), # insn/characteristic(nzxor): no security cookies ("mimikatz", "function=0x46B67A", capa.features.Characteristic("nzxor"), False), + # insn/characteristic(peb access) + ("kernel32-64", "function=0x180001068", capa.features.Characteristic("peb access"), True), + ("mimikatz", "function=0x46B67A", capa.features.Characteristic("peb access"), False), + # insn/characteristic(gs access) + ("kernel32-64", "function=0x180001068", capa.features.Characteristic("gs access"), True), + ("mimikatz", "function=0x46B67A", capa.features.Characteristic("gs access"), False), ], indirect=["sample", "scope"], ) @@ -299,16 +305,6 @@ def test_lancelot_features(sample, scope, feature, expected): """ - -def test_nzxor_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x410DFC)) - assert capa.features.Characteristic("nzxor") in features # 0x0410F0B - -def test_peb_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): - features = extract_function_features(lancelot_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.ws, 0xABA6FEC)) - assert capa.features.Characteristic("peb access") in features - - def test_tight_loop_features(mimikatz): f = lancelot_utils.Function(mimikatz.ws, 0x402EC4) for bb in f.basic_blocks: From 21adb2b9d14d7ae4368bd1b29890ffa0fbbafa4e Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:16:14 -0600 Subject: [PATCH 20/44] tests: lancelot: formatting --- tests/test_lancelot_features.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index bb399cb0..e46f9dfb 100644 --- 
a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -251,20 +251,10 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x4556E5", capa.features.insn.API("advapi32.LsaQueryInformationPolicy"), True), ("mimikatz", "function=0x4556E5", capa.features.insn.API("LsaQueryInformationPolicy"), True), # insn/api: x64 - ( - "kernel32-64", - "function=0x180001010", - capa.features.insn.API("api-ms-win-core-rtlsupport-l1-1-0.RtlVirtualUnwind"), - True, - ), + ("kernel32-64", "function=0x180001010", capa.features.insn.API("RtlVirtualUnwind"), True,), ("kernel32-64", "function=0x180001010", capa.features.insn.API("RtlVirtualUnwind"), True), # insn/api: x64 thunk - ( - "kernel32-64", - "function=0x1800202B0", - capa.features.insn.API("api-ms-win-core-rtlsupport-l1-1-0.RtlCaptureContext"), - True, - ), + ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True,), ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True), # insn/string ("mimikatz", "function=0x40105D", capa.features.String("SCardControl"), True), From fdd6f7434bbb78b4be4d6299115fb1d890f0ab38 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:40:45 -0600 Subject: [PATCH 21/44] lancelot: insn: xsection flow and recursive calls --- capa/features/extractors/lancelot/insn.py | 63 ++++++++++++++++++++--- tests/test_lancelot_features.py | 55 +++++--------------- 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 158d9ab7..0a5ac838 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -269,11 +269,9 @@ def read_bytes(xtor, va): raise ValueError("invalid address") -def extract_insn_bytes_features(xtor, f, bb, insn): - """ - parse byte sequence features from the given instruction. 
- """ - if insn.mnemonic == ( +# these are mnemonics that may flow (jump) elsewhere +FLOW_MNEMONICS = set( + [ "call", "jb", "jbe", @@ -297,7 +295,15 @@ def extract_insn_bytes_features(xtor, f, bb, insn): "jrcxz", "js", "jz", - ): + ] +) + + +def extract_insn_bytes_features(xtor, f, bb, insn): + """ + parse byte sequence features from the given instruction. + """ + if insn.mnemonic in FLOW_MNEMONICS: return for operand in insn.operands: @@ -411,17 +417,58 @@ def extract_insn_segment_access_features(xtor, f, bb, insn): yield Characteristic("fs access"), insn.address +def get_section(xtor, va): + pe = get_pefile(xtor) + + for i, section in enumerate(pe.sections): + section_start = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section_end = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section.Misc_VirtualSize + + if section_start <= va < section_end: + return i + + raise ValueError("invalid address") + + def extract_insn_cross_section_cflow(xtor, f, bb, insn): """ inspect the instruction for a CALL or JMP that crosses section boundaries. """ - raise NotImplementedError() + if insn.mnemonic not in FLOW_MNEMONICS: + return + + try: + target = get_operand_target(insn, insn.operands[0]) + except ValueError: + return + + if target in get_imports(xtor): + return + + try: + if get_section(xtor, insn.address) != get_section(xtor, target): + yield Characteristic("cross section flow"), insn.address + except ValueError: + return # this is a feature that's most relevant at the function scope, # however, its most efficient to extract at the instruction scope. 
def extract_function_calls_from(xtor, f, bb, insn): - raise NotImplementedError() + if insn.mnemonic != "call": + return + + try: + target = get_operand_target(insn, insn.operands[0]) + except ValueError: + return + + if target in get_imports(xtor): + return + + yield Characteristic("calls from"), target + if target == f: + yield Characteristic("recursive call"), target # this is a feature that's most relevant at the function or basic block scope, diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index e46f9dfb..68b6e2ea 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -80,6 +80,10 @@ def sample(request): return os.path.join(CD, "data", "kernel32-64.dll_") elif request.param == "pma12-04": return os.path.join(CD, "data", "Practical Malware Analysis Lab 12-04.exe_") + elif request.param.startswith("a1982"): + return os.path.join(CD, "data", "a198216798ca38f280dc413f8c57f2c2.exe_") + elif request.param.startswith("39c05"): + return os.path.join(CD, "data", "39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.dll_") else: raise ValueError("unexpected sample fixture") @@ -281,6 +285,14 @@ def parametrize(params, values, **kwargs): # insn/characteristic(gs access) ("kernel32-64", "function=0x180001068", capa.features.Characteristic("gs access"), True), ("mimikatz", "function=0x46B67A", capa.features.Characteristic("gs access"), False), + # insn/characteristic(cross section flow) + ("a1982...", "function=0x4014D0", capa.features.Characteristic("cross section flow"), True), + # insn/characteristic(cross section flow): imports don't count + ("kernel32-64", "function=0x180001068", capa.features.Characteristic("cross section flow"), False), + ("mimikatz", "function=0x46B67A", capa.features.Characteristic("cross section flow"), False), + # insn/characteristic(recursive call) + ("39c05...", "function=0x10003100", capa.features.Characteristic("recursive call"), True), + ("mimikatz", 
"function=0x46B67A", capa.features.Characteristic("recursive call"), False), ], indirect=["sample", "scope"], ) @@ -295,49 +307,6 @@ def test_lancelot_features(sample, scope, feature, expected): """ -def test_tight_loop_features(mimikatz): - f = lancelot_utils.Function(mimikatz.ws, 0x402EC4) - for bb in f.basic_blocks: - if bb.va != 0x402F8E: - continue - features = extract_basic_block_features(f, bb) - assert capa.features.Characteristic("tight loop") in features - assert capa.features.basicblock.BasicBlock() in features - - -def test_tight_loop_bb_features(mimikatz): - f = lancelot_utils.Function(mimikatz.ws, 0x402EC4) - for bb in f.basic_blocks: - if bb.va != 0x402F8E: - continue - features = extract_basic_block_features(f, bb) - assert capa.features.Characteristic("tight loop") in features - assert capa.features.basicblock.BasicBlock() in features - - -def test_cross_section_flow_features(sample_a198216798ca38f280dc413f8c57f2c2): - features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x4014D0)) - assert capa.features.Characteristic("cross section flow") in features - - # this function has calls to some imports, - # which should not trigger cross-section flow characteristic - features = extract_function_features(lancelot_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.ws, 0x401563)) - assert capa.features.Characteristic("cross section flow") not in features - - -def test_segment_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): - features = extract_function_features(lancelot_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.ws, 0xABA6FEC)) - assert capa.features.Characteristic("fs access") in features - - -def test_switch_features(mimikatz): - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x409411)) - assert capa.features.Characteristic("switch") in features - - features = extract_function_features(lancelot_utils.Function(mimikatz.ws, 0x409393)) - assert 
capa.features.Characteristic("switch") not in features - - def test_recursive_call_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): features = extract_function_features( lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10003100) From bdf6b69be6146c29a8f837e40f80631a0e08c985 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:45:58 -0600 Subject: [PATCH 22/44] lancelot: insn: indirect call --- capa/features/extractors/lancelot/insn.py | 12 +++++++++++- tests/test_lancelot_features.py | 15 +++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 0a5ac838..381f791c 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -14,6 +14,7 @@ from lancelot import ( MEMORY_OPERAND_BASE, MEMORY_OPERAND_DISP, OPERAND_TYPE_MEMORY, + MEMORY_OPERAND_INDEX, OPERAND_TYPE_REGISTER, MEMORY_OPERAND_SEGMENT, OPERAND_TYPE_IMMEDIATE, @@ -478,7 +479,16 @@ def extract_function_indirect_call_characteristic_features(xtor, f, bb, insn): extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4]) does not include calls like => call ds:dword_ABD4974 """ - raise NotImplementedError() + if insn.mnemonic != "call": + return + + op0 = insn.operands[0] + if op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER: + yield Characteristic("indirect call"), insn.address + elif op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and op0[MEMORY_OPERAND_BASE] is not None: + yield Characteristic("indirect call"), insn.address + elif op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and op0[MEMORY_OPERAND_INDEX] is not None: + yield Characteristic("indirect call"), insn.address _not_implemented = set([]) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 68b6e2ea..f2511955 100644 --- 
a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -293,6 +293,9 @@ def parametrize(params, values, **kwargs): # insn/characteristic(recursive call) ("39c05...", "function=0x10003100", capa.features.Characteristic("recursive call"), True), ("mimikatz", "function=0x46B67A", capa.features.Characteristic("recursive call"), False), + # insn/characteristic(indirect call) + ("mimikatz", "function=0x4175FF", capa.features.Characteristic("indirect call"), True), + ("mimikatz", "function=0x46B67A", capa.features.Characteristic("indirect call"), False), ], indirect=["sample", "scope"], ) @@ -307,18 +310,6 @@ def test_lancelot_features(sample, scope, feature, expected): """ -def test_recursive_call_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): - features = extract_function_features( - lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10003100) - ) - assert capa.features.Characteristic("recursive call") in features - - features = extract_function_features( - lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10007B00) - ) - assert capa.features.Characteristic("recursive call") not in features - - def test_loop_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): features = extract_function_features( lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10003D30) From c739caee402feca113070c5e110dd0331d911750 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 18:53:40 -0600 Subject: [PATCH 23/44] tests: lancelot: add tests for counts of features --- capa/features/extractors/lancelot/insn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 381f791c..cc590097 100644 --- a/capa/features/extractors/lancelot/insn.py +++ 
b/capa/features/extractors/lancelot/insn.py @@ -464,9 +464,6 @@ def extract_function_calls_from(xtor, f, bb, insn): except ValueError: return - if target in get_imports(xtor): - return - yield Characteristic("calls from"), target if target == f: yield Characteristic("recursive call"), target From db45a06ba755d281d1895d1cc396a14c6eaa74be Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 19:04:09 -0600 Subject: [PATCH 24/44] lancelot: insn: simplify operand target fetching --- capa/features/extractors/lancelot/insn.py | 121 +++++++++------------- 1 file changed, 49 insertions(+), 72 deletions(-) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index cc590097..3af2f5a9 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -65,6 +65,33 @@ def get_imports(xtor): return imports +def get_operand_target(insn, op): + if insn.address in (0x455A41, 0x46AE12): + print(insn, insn.operands) + + if op[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: + # call direct, x64 + # rip relative + # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 + if op[MEMORY_OPERAND_BASE] == "rip": + return op[MEMORY_OPERAND_DISP] + insn.address + insn.length + + # call direct, x32 + # mimikatz:0x403BD3 call ds:CryptAcquireContextW + elif op[MEMORY_OPERAND_BASE] == None: + return op[MEMORY_OPERAND_DISP] + + # call via thunk + # mimikatz:0x455A41 call LsaQueryInformationPolicy + elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op[IMMEDIATE_OPERAND_IS_RELATIVE]: + return op[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length + + elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE: + return op[IMMEDIATE_OPERAND_VALUE] + + raise ValueError("memory operand has no target") + + @lru_cache def get_thunks(xtor): thunks = {} @@ -79,29 +106,17 @@ def get_thunks(xtor): op0 = insn.operands[0] - if op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: - target = op0[MEMORY_OPERAND_DISP] - - # direct, x64, 
rip relative - # 180020570 FF 25 DA 83 05 00 jmp cs:RtlCaptureContext_0 - if op0[MEMORY_OPERAND_BASE] == "rip": - target = op0[MEMORY_OPERAND_DISP] + insn.address + insn.length - - # direct, x32 - # mimikatz:.text:0046AE12 FF 25 54 30 47 00 jmp ds:__imp_LsaQueryInformationPolicy - elif op0[MEMORY_OPERAND_BASE] == None: - target = op0[MEMORY_OPERAND_DISP] - - else: - continue - - imports = get_imports(xtor) - if target not in imports: - continue - - thunks[va] = imports[target] + try: + target = get_operand_target(insn, op0) + except ValueError: continue + imports = get_imports(xtor) + if target not in imports: + continue + + thunks[va] = imports[target] + return thunks @@ -113,35 +128,21 @@ def extract_insn_api_features(xtor, f, bb, insn): op0 = insn.operands[0] - if op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: + try: + target = get_operand_target(insn, op0) + except ValueError: + return - # call direct, x64 - # rip relative - # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 - if op0[MEMORY_OPERAND_BASE] == "rip": - target = op0[MEMORY_OPERAND_DISP] + insn.address + insn.length + imports = get_imports(xtor) + if target in imports: + for feature, va in capa.features.extractors.helpers.generate_api_features(imports[target], insn.address): + yield feature, va + return - # call direct, x32 - # mimikatz:0x403BD3 call ds:CryptAcquireContextW - elif op0[MEMORY_OPERAND_BASE] == None: - target = op0[MEMORY_OPERAND_DISP] - - else: - return - - imports = get_imports(xtor) - if target in imports: - for feature, va in capa.features.extractors.helpers.generate_api_features(imports[target], insn.address): - yield feature, va - - # call via thunk - # mimikatz:0x455A41 call LsaQueryInformationPolicy - elif op0[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op0[IMMEDIATE_OPERAND_IS_RELATIVE]: - target = op0[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length - thunks = get_thunks(xtor) - if target in thunks: - for feature, va in 
capa.features.extractors.helpers.generate_api_features(thunks[target], insn.address): - yield feature, va + thunks = get_thunks(xtor) + if target in thunks: + for feature, va in capa.features.extractors.helpers.generate_api_features(thunks[target], insn.address): + yield feature, va def extract_insn_mnemonic_features(xtor, f, bb, insn): @@ -224,30 +225,6 @@ def derefs(xtor, p): p = next -def get_operand_target(insn, op): - if op[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: - # call direct, x64 - # rip relative - # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 - if op[MEMORY_OPERAND_BASE] == "rip": - return op[MEMORY_OPERAND_DISP] + insn.address + insn.length - - # call direct, x32 - # mimikatz:0x403BD3 call ds:CryptAcquireContextW - elif op[MEMORY_OPERAND_BASE] == None: - return op[MEMORY_OPERAND_DISP] - - # call via thunk - # mimikatz:0x455A41 call LsaQueryInformationPolicy - elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op[IMMEDIATE_OPERAND_IS_RELATIVE]: - return op[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length - - elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE: - return op[IMMEDIATE_OPERAND_VALUE] - - raise ValueError("memory operand has no target") - - def read_bytes(xtor, va): """ read up to MAX_BYTES_FEATURE_SIZE from the given address. 
From a59e1054fe6cbec7d667d1ebf32289fe6d61cc69 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 10 Aug 2020 19:04:29 -0600 Subject: [PATCH 25/44] tests: lancelot: feature counts --- tests/test_lancelot_features.py | 48 +++++++++++++-------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index f2511955..d2f1f70c 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -296,6 +296,9 @@ def parametrize(params, values, **kwargs): # insn/characteristic(indirect call) ("mimikatz", "function=0x4175FF", capa.features.Characteristic("indirect call"), True), ("mimikatz", "function=0x46B67A", capa.features.Characteristic("indirect call"), False), + # insn/characteristic(calls from) + ("mimikatz", "function=0x46B67A", capa.features.Characteristic("calls from"), True), + ("mimikatz", "function=0x40E51B", capa.features.Characteristic("calls from"), False), ], indirect=["sample", "scope"], ) @@ -309,19 +312,23 @@ def test_lancelot_features(sample, scope, feature, expected): assert feature.evaluate(features) == expected, msg +@parametrize( + "sample,scope,feature,expected", + [ + ("mimikatz", "function=0x40E51B", capa.features.basicblock.BasicBlock(), 1), + ("mimikatz", "function=0x40E5C2", capa.features.basicblock.BasicBlock(), 7), + ("mimikatz", "function=0x40E5C2", capa.features.Characteristic("calls from"), 3), + ], + indirect=["sample", "scope"], +) +def test_lancelot_feature_counts(sample, scope, feature, expected): + extractor = get_lancelot_extractor(sample) + features = scope(extractor) + msg = "%s should be found %d times in %s" % (str(feature), expected, scope.__name__) + assert len(features[feature]) == expected, msg + + """ -def test_loop_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): - features = extract_function_features( - 
lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10003D30) - ) - assert capa.features.Characteristic("loop") in features - - features = extract_function_features( - lancelot_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.ws, 0x10007250) - ) - assert capa.features.Characteristic("loop") not in features - - def test_function_calls_to(sample_9324d1a8ae37a36ae560c37448c9705a): features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) assert capa.features.Characteristic("calls to") in features @@ -334,23 +341,6 @@ def test_function_calls_to64(sample_lab21_01): assert len(features[capa.features.Characteristic("calls to")]) == 8 -def test_function_calls_from(sample_9324d1a8ae37a36ae560c37448c9705a): - features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) - assert capa.features.Characteristic("calls from") in features - assert len(features[capa.features.Characteristic("calls from")]) == 23 - - -def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a): - features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) - assert len(features[capa.features.basicblock.BasicBlock()]) == 26 - - -def test_indirect_call_features(sample_a933a1a402775cfa94b6bee0963f4b46): - features = extract_function_features(lancelot_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.ws, 0xABA68A0)) - assert capa.features.Characteristic("indirect call") in features - assert len(features[capa.features.Characteristic("indirect call")]) == 3 - - def test_indirect_calls_resolved(sample_c91887d861d9bd4a5872249b641bc9f9): features = extract_function_features(lancelot_utils.Function(sample_c91887d861d9bd4a5872249b641bc9f9.ws, 0x401A77)) assert capa.features.insn.API("kernel32.CreatePipe") in features From 
5c967cd6ef27836de6e6703d4d76c95d1bc327d9 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 13 Aug 2020 12:39:32 -0600 Subject: [PATCH 26/44] lancelot: insn: calls to/from --- capa/features/extractors/lancelot/function.py | 17 +++++++++++++---- capa/features/extractors/lancelot/insn.py | 19 ++++++++++++++----- tests/test_lancelot_features.py | 13 ++++--------- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/capa/features/extractors/lancelot/function.py b/capa/features/extractors/lancelot/function.py index 5ed6a900..e8c8cc2c 100644 --- a/capa/features/extractors/lancelot/function.py +++ b/capa/features/extractors/lancelot/function.py @@ -1,5 +1,10 @@ import logging +try: + from functools import lru_cache +except ImportError: + from backports.functools_lru_cache import lru_cache + from lancelot import ( FLOW_VA, FLOW_TYPE, @@ -14,12 +19,16 @@ from capa.features.extractors import loops logger = logging.getLogger(__name__) -def extract_function_switch(ws, f): - raise NotImplementedError() +@lru_cache +def get_call_graph(ws): + return ws.build_call_graph() def extract_function_calls_to(ws, f): - raise NotImplementedError() + cg = get_call_graph(ws) + + for caller in cg.calls_to.get(f, []): + yield Characteristic("calls to"), caller def extract_function_loop(ws, f): @@ -38,7 +47,7 @@ def extract_function_loop(ws, f): yield Characteristic("loop"), f -FUNCTION_HANDLERS = (extract_function_switch, extract_function_calls_to, extract_function_loop) +FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop) _not_implemented = set([]) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 3af2f5a9..e86e1263 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -26,6 +26,7 @@ from lancelot import ( import capa.features.extractors.helpers from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic from 
capa.features.insn import Number, Offset, Mnemonic +from capa.features.extractors.lancelot.function import get_call_graph logger = logging.getLogger(__name__) @@ -430,9 +431,17 @@ def extract_insn_cross_section_cflow(xtor, f, bb, insn): return -# this is a feature that's most relevant at the function scope, -# however, its most efficient to extract at the instruction scope. def extract_function_calls_from(xtor, f, bb, insn): + cg = get_call_graph(xtor.ws) + + for callee in cg.calls_from.get(insn.address, []): + yield Characteristic("calls from"), callee + + if callee == f: + yield Characteristic("recursive call"), insn.address + + # lancelot doesn't count API calls when constructing the call graph + # so we still have to scan for calls to an import if insn.mnemonic != "call": return @@ -441,9 +450,9 @@ def extract_function_calls_from(xtor, f, bb, insn): except ValueError: return - yield Characteristic("calls from"), target - if target == f: - yield Characteristic("recursive call"), target + imports = get_imports(xtor) + if target in imports: + yield Characteristic("calls from"), target # this is a feature that's most relevant at the function or basic block scope, diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index d2f1f70c..0f3f57bf 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -182,15 +182,6 @@ def parametrize(params, values, **kwargs): # function/characteristic(loop) ("mimikatz", "function=0x401517", capa.features.Characteristic("loop"), True), ("mimikatz", "function=0x401000", capa.features.Characteristic("loop"), False), - # function/characteristic(switch) - pytest.param( - "mimikatz", - "function=0x409411", - capa.features.Characteristic("switch"), - True, - marks=pytest.mark.xfail(reason="characteristic(switch) not implemented yet"), - ), - ("mimikatz", "function=0x401000", capa.features.Characteristic("switch"), False), # function/characteristic(calls to) pytest.param( "mimikatz", @@ 
-299,6 +290,10 @@ def parametrize(params, values, **kwargs): # insn/characteristic(calls from) ("mimikatz", "function=0x46B67A", capa.features.Characteristic("calls from"), True), ("mimikatz", "function=0x40E51B", capa.features.Characteristic("calls from"), False), + # function/characteristic(calls to) + ("mimikatz", "function=0x40105D", capa.features.Characteristic("calls to"), True), + ("mimikatz", "function=0x46C0D2", capa.features.Characteristic("calls to"), False), + ("mimikatz", "function=0x40E51B", capa.features.Characteristic("calls to"), False), ], indirect=["sample", "scope"], ) From c6f27200fee75961d85d9149cce3393c2e9162d2 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 13 Aug 2020 13:22:29 -0600 Subject: [PATCH 27/44] lancelot: pull get_operand_target into helpers --- capa/features/extractors/lancelot/helpers.py | 33 ++++++++++++++++++++ capa/features/extractors/lancelot/insn.py | 29 ++--------------- 2 files changed, 35 insertions(+), 27 deletions(-) create mode 100644 capa/features/extractors/lancelot/helpers.py diff --git a/capa/features/extractors/lancelot/helpers.py b/capa/features/extractors/lancelot/helpers.py new file mode 100644 index 00000000..0e755104 --- /dev/null +++ b/capa/features/extractors/lancelot/helpers.py @@ -0,0 +1,33 @@ +from lancelot import ( + OPERAND_TYPE, + MEMORY_OPERAND_BASE, + MEMORY_OPERAND_DISP, + OPERAND_TYPE_MEMORY, + OPERAND_TYPE_IMMEDIATE, + IMMEDIATE_OPERAND_VALUE, + IMMEDIATE_OPERAND_IS_RELATIVE, +) + + +def get_operand_target(insn, op): + if op[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: + # call direct, x64 + # rip relative + # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 + if op[MEMORY_OPERAND_BASE] == "rip": + return op[MEMORY_OPERAND_DISP] + insn.address + insn.length + + # call direct, x32 + # mimikatz:0x403BD3 call ds:CryptAcquireContextW + elif op[MEMORY_OPERAND_BASE] == None: + return op[MEMORY_OPERAND_DISP] + + # call via thunk + # mimikatz:0x455A41 call LsaQueryInformationPolicy + elif 
op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op[IMMEDIATE_OPERAND_IS_RELATIVE]: + return op[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length + + elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE: + return op[IMMEDIATE_OPERAND_VALUE] + + raise ValueError("memory operand has no target") diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index e86e1263..63d2353d 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -26,7 +26,9 @@ from lancelot import ( import capa.features.extractors.helpers from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic from capa.features.insn import Number, Offset, Mnemonic +from capa.features.extractors.lancelot.helpers import get_operand_target from capa.features.extractors.lancelot.function import get_call_graph +from capa.features.extractors.lancelot.indirect_calls import NotFoundError, resolve_indirect_call logger = logging.getLogger(__name__) @@ -66,33 +68,6 @@ def get_imports(xtor): return imports -def get_operand_target(insn, op): - if insn.address in (0x455A41, 0x46AE12): - print(insn, insn.operands) - - if op[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: - # call direct, x64 - # rip relative - # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 - if op[MEMORY_OPERAND_BASE] == "rip": - return op[MEMORY_OPERAND_DISP] + insn.address + insn.length - - # call direct, x32 - # mimikatz:0x403BD3 call ds:CryptAcquireContextW - elif op[MEMORY_OPERAND_BASE] == None: - return op[MEMORY_OPERAND_DISP] - - # call via thunk - # mimikatz:0x455A41 call LsaQueryInformationPolicy - elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op[IMMEDIATE_OPERAND_IS_RELATIVE]: - return op[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length - - elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE: - return op[IMMEDIATE_OPERAND_VALUE] - - raise ValueError("memory operand has no target") - - @lru_cache def 
get_thunks(xtor): thunks = {} From caf738ee4e2701135546cacc0d7d97e0c7277cc4 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 13 Aug 2020 13:22:50 -0600 Subject: [PATCH 28/44] lancelot: implement indirect call resolution --- .../extractors/lancelot/indirect_calls.py | 149 ++++++++++++++++++ capa/features/extractors/lancelot/insn.py | 16 +- tests/test_lancelot_features.py | 37 +---- 3 files changed, 168 insertions(+), 34 deletions(-) create mode 100644 capa/features/extractors/lancelot/indirect_calls.py diff --git a/capa/features/extractors/lancelot/indirect_calls.py b/capa/features/extractors/lancelot/indirect_calls.py new file mode 100644 index 00000000..e1318873 --- /dev/null +++ b/capa/features/extractors/lancelot/indirect_calls.py @@ -0,0 +1,149 @@ +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +import collections + +from lancelot import ( + FLOW_VA, + OPERAND_TYPE, + PERMISSION_READ, + MEMORY_OPERAND_BASE, + MEMORY_OPERAND_DISP, + OPERAND_TYPE_MEMORY, + MEMORY_OPERAND_INDEX, + OPERAND_TYPE_REGISTER, + MEMORY_OPERAND_SEGMENT, + OPERAND_TYPE_IMMEDIATE, + IMMEDIATE_OPERAND_VALUE, + REGISTER_OPERAND_REGISTER, + IMMEDIATE_OPERAND_IS_RELATIVE, +) + +from capa.features.extractors.lancelot.helpers import get_operand_target + +DESTRUCTIVE_MNEMONICS = ("mov", "lea", "pop", "xor") + + +class NotFoundError(Exception): + pass + + +def read_instructions(ws, bb): + va = bb.address + while va < bb.address + bb.length: + try: + insn = ws.read_insn(va) + except ValueError: + return + + yield insn + va += insn.length + + +def build_instruction_predecessors(ws, cfg): + preds = collections.defaultdict(set) + + for bb in cfg.basic_blocks.values(): + insns = list(read_instructions(ws, bb)) + + for i, insn in enumerate(insns): + if i == 0: + for pred in bb.predecessors: + pred_bb = cfg.basic_blocks[pred[FLOW_VA]] + preds[insn.address].add(list(read_instructions(ws, pred_bb))[-1].address) + else: + preds[insn.address].add(insns[i - 1].address) + + return preds + + +def find_definition(ws, f, insn): + """ + scan backwards from the given address looking for assignments to the given register. + if a constant, return that value. + args: + ws (lancelot.PE) + f (int): the function start address + insn (lancelot.Instruction): call instruction to resolve + returns: + (va: int, value?: int|None): the address of the assignment and the value, if a constant. + raises: + NotFoundError: when the definition cannot be found. 
+ """ + assert insn.mnemonic == "call" + op0 = insn.operands[0] + assert op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER + reg = op0[REGISTER_OPERAND_REGISTER] + + cfg = ws.build_cfg(f) + preds = build_instruction_predecessors(ws, cfg) + + q = collections.deque() + seen = set([]) + q.extend(preds[insn.address]) + while q: + cur = q.popleft() + + # skip if we've already processed this location + if cur in seen: + continue + seen.add(cur) + + insn = ws.read_insn(cur) + operands = insn.operands + + if len(operands) == 0: + q.extend(preds[cur]) + continue + + op0 = operands[0] + if not ( + op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER + and op0[REGISTER_OPERAND_REGISTER] == reg + and insn.mnemonic in DESTRUCTIVE_MNEMONICS + ): + q.extend(preds[cur]) + continue + + # if we reach here, the instruction is destructive to our target register. + + # we currently only support extracting the constant from something like: `mov $reg, IAT` + # so, any other pattern results in an unknown value, represented by None. + # this is a good place to extend in the future, if we need more robust support. + if insn.mnemonic != "mov": + return (cur, None) + else: + op1 = operands[1] + try: + target = get_operand_target(insn, op1) + except ValueError: + return (cur, None) + else: + return (cur, target) + + raise NotFoundError() + + +def is_indirect_call(insn): + return insn.mnemonic == "call" and insn.operands[0][OPERAND_TYPE] == OPERAND_TYPE_REGISTER + + +def resolve_indirect_call(ws, f, insn): + """ + inspect the given indirect call instruction and attempt to resolve the target address. + args: + ws (lancelot.PE): the analysis workspace + f (int): the address of the function to analyze + insn (lancelot.Instruction): the instruction at which to start analysis + returns: + (va: int, value?: int|None): the address of the assignment and the value, if a constant. + raises: + NotFoundError: when the definition cannot be found. 
+ """ + assert is_indirect_call(insn) + return find_definition(ws, f, insn) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 63d2353d..854ecb0d 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -104,10 +104,18 @@ def extract_insn_api_features(xtor, f, bb, insn): op0 = insn.operands[0] - try: - target = get_operand_target(insn, op0) - except ValueError: - return + if op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER: + try: + (_, target) = resolve_indirect_call(xtor.ws, f, insn) + except NotFoundError: + return + if target is None: + return + else: + try: + target = get_operand_target(insn, op0) + except ValueError: + return imports = get_imports(xtor) if target in imports: diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 0f3f57bf..bcf3cf8c 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -84,6 +84,8 @@ def sample(request): return os.path.join(CD, "data", "a198216798ca38f280dc413f8c57f2c2.exe_") elif request.param.startswith("39c05"): return os.path.join(CD, "data", "39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.dll_") + elif request.param.startswith("c9188"): + return os.path.join(CD, "data", "c91887d861d9bd4a5872249b641bc9f9.exe_") else: raise ValueError("unexpected sample fixture") @@ -182,14 +184,6 @@ def parametrize(params, values, **kwargs): # function/characteristic(loop) ("mimikatz", "function=0x401517", capa.features.Characteristic("loop"), True), ("mimikatz", "function=0x401000", capa.features.Characteristic("loop"), False), - # function/characteristic(calls to) - pytest.param( - "mimikatz", - "function=0x401000", - capa.features.Characteristic("calls to"), - True, - marks=pytest.mark.xfail(reason="characteristic(calls to) not implemented yet"), - ), # bb/characteristic(tight loop) ("mimikatz", "function=0x402EC4", capa.features.Characteristic("tight loop"), 
True), ("mimikatz", "function=0x401000", capa.features.Characteristic("tight loop"), False), @@ -251,6 +245,11 @@ def parametrize(params, values, **kwargs): # insn/api: x64 thunk ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True,), ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True), + # insn/api: resolve indirect calls + ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.CreatePipe"), True), + ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.SetHandleInformation"), True), + ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.CloseHandle"), True), + ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.WriteFile"), True), # insn/string ("mimikatz", "function=0x40105D", capa.features.String("SCardControl"), True), ("mimikatz", "function=0x40105D", capa.features.String("SCardTransmit"), True), @@ -321,25 +320,3 @@ def test_lancelot_feature_counts(sample, scope, feature, expected): features = scope(extractor) msg = "%s should be found %d times in %s" % (str(feature), expected, scope.__name__) assert len(features[feature]) == expected, msg - - -""" -def test_function_calls_to(sample_9324d1a8ae37a36ae560c37448c9705a): - features = extract_function_features(lancelot_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.ws, 0x406F60)) - assert capa.features.Characteristic("calls to") in features - assert len(features[capa.features.Characteristic("calls to")]) == 1 - - -def test_function_calls_to64(sample_lab21_01): - features = extract_function_features(lancelot_utils.Function(sample_lab21_01.ws, 0x1400052D0)) # memcpy - assert capa.features.Characteristic("calls to") in features - assert len(features[capa.features.Characteristic("calls to")]) == 8 - - -def test_indirect_calls_resolved(sample_c91887d861d9bd4a5872249b641bc9f9): - features = 
extract_function_features(lancelot_utils.Function(sample_c91887d861d9bd4a5872249b641bc9f9.ws, 0x401A77)) - assert capa.features.insn.API("kernel32.CreatePipe") in features - assert capa.features.insn.API("kernel32.SetHandleInformation") in features - assert capa.features.insn.API("kernel32.CloseHandle") in features - assert capa.features.insn.API("kernel32.WriteFile") in features -""" From d5f73b47a44a04cbac48761b2ad1cdb80a83088b Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 13 Aug 2020 13:35:09 -0600 Subject: [PATCH 29/44] main: use lancelot on py3 --- capa/main.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index f5a50749..ad740f64 100644 --- a/capa/main.py +++ b/capa/main.py @@ -31,7 +31,7 @@ import capa.features.extractors from capa.helpers import oint, get_file_taste RULES_PATH_DEFAULT_STRING = "(embedded rules)" -SUPPORTED_FILE_MAGIC = set(["MZ"]) +SUPPORTED_FILE_MAGIC = set([b"MZ"]) logger = logging.getLogger("capa") @@ -288,7 +288,24 @@ class UnsupportedRuntimeError(RuntimeError): def get_extractor_py3(path, format): - raise UnsupportedRuntimeError() + try: + import lancelot + + import capa.features.extractors.lancelot + except ImportError: + logger.warning("lancelot not installed") + raise UnsupportedRuntimeError() + + if format not in ("pe", "auto"): + raise UnsupportedFormatError(format) + + if not is_supported_file_type(path): + raise UnsupportedFormatError() + + with open(path, "rb") as f: + buf = f.read() + + return capa.features.extractors.lancelot.LancelotFeatureExtractor(buf) def get_extractor(path, format): From 40d16c925f5b7fe5584a800130349f60cc8d8f66 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 13 Aug 2020 17:23:36 -0600 Subject: [PATCH 30/44] main: progress bar updates (+rules, and realize iterators) --- capa/main.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/capa/main.py b/capa/main.py index ad740f64..34871682 
100644 --- a/capa/main.py +++ b/capa/main.py @@ -106,7 +106,7 @@ def find_capabilities(ruleset, extractor, disable_progress=None): meta = {"feature_counts": {"file": 0, "functions": {},}} - for f in tqdm.tqdm(extractor.get_functions(), disable=disable_progress, unit=" functions"): + for f in tqdm.tqdm(list(extractor.get_functions()), disable=disable_progress, desc="matching", unit=" functions"): function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) meta["feature_counts"]["functions"][f.__int__()] = feature_count logger.debug("analyzed function 0x%x and extracted %d features", f.__int__(), feature_count) @@ -332,7 +332,7 @@ def is_nursery_rule_path(path): return "nursery" in path -def get_rules(rule_path): +def get_rules(rule_path, disable_progress=False): if not os.path.exists(rule_path): raise IOError("rule path %s does not exist or cannot be accessed" % rule_path) @@ -360,7 +360,8 @@ def get_rules(rule_path): rule_paths.append(rule_path) rules = [] - for rule_path in rule_paths: + + for rule_path in tqdm.tqdm(list(rule_paths), disable=disable_progress, desc="loading ", unit=" rules"): try: rule = capa.rules.Rule.from_yaml_file(rule_path) except capa.rules.InvalidRule: @@ -543,7 +544,7 @@ def main(argv=None): logger.debug("using rules path: %s", rules_path) try: - rules = get_rules(rules_path) + rules = get_rules(rules_path, disable_progress=args.quiet) rules = capa.rules.RuleSet(rules) logger.debug("successfully loaded %s rules", len(rules)) if args.tag: From 1e097ef759ed08869a503d21d13d67f03a356f67 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 15 Aug 2020 11:13:06 -0600 Subject: [PATCH 31/44] setup: add pylancelot dep on py3 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index c75b05e1..0e5333bd 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ requirements = ["six", "tqdm", "pyyaml", "tabulate", "colorama", "termcolor", "r if sys.version_info >= (3, 0): # py3 
requirements.append("networkx") + requirements.append("pylancelot==0.2.1") else: # py2 requirements.append("enum34") From 8721eb05ebf0f546d8d85bc5ebcf983011e12541 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 15 Aug 2020 23:32:13 -0600 Subject: [PATCH 32/44] tests: show found number of features when unexpected --- tests/fixtures.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 21ee429d..50a766d8 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -376,7 +376,12 @@ def do_test_feature_presence(get_extractor, sample, scope, feature, expected): def do_test_feature_count(get_extractor, sample, scope, feature, expected): extractor = get_extractor(sample) features = scope(extractor) - msg = "%s should be found %d times in %s" % (str(feature), expected, scope.__name__) + msg = "%s should be found %d times in %s, found: %d" % ( + str(feature), + expected, + scope.__name__, + len(features[feature]), + ) assert len(features[feature]) == expected, msg From 980a34adca0dbf4d4749a7d18924700b8b35a13d Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 15 Aug 2020 23:32:35 -0600 Subject: [PATCH 33/44] setup: bump lancelot dep version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0e5333bd..3de02fb0 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ requirements = ["six", "tqdm", "pyyaml", "tabulate", "colorama", "termcolor", "r if sys.version_info >= (3, 0): # py3 requirements.append("networkx") - requirements.append("pylancelot==0.2.1") + requirements.append("pylancelot~=0.3.5") else: # py2 requirements.append("enum34") From c623791a8421acaea31144bc9704effd61d60044 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sat, 15 Aug 2020 23:32:50 -0600 Subject: [PATCH 34/44] tests: lancelot: use common harness from fixtures --- tests/test_lancelot_features.py | 302 +------------------------------- 1 file changed, 7 
insertions(+), 295 deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index bcf3cf8c..96db0d83 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -6,317 +6,29 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -import os.path -import collections -try: - from functools import lru_cache -except ImportError: - from backports.functools_lru_cache import lru_cache - -import pytest - -import capa.features -import capa.features.file -import capa.features.insn -import capa.features.basicblock -import capa.features.extractors.lancelot.file -import capa.features.extractors.lancelot.insn -import capa.features.extractors.lancelot.function -import capa.features.extractors.lancelot.basicblock -from capa.features import ARCH_X32, ARCH_X64 - -CD = os.path.dirname(__file__) - - -@lru_cache -def extract_file_features(extractor): - features = collections.defaultdict(set) - for feature, va in extractor.extract_file_features(): - features[feature].add(va) - return features - - -@lru_cache -def extract_function_features(extractor, f): - features = collections.defaultdict(set) - for bb in extractor.get_basic_blocks(f): - for insn in extractor.get_instructions(f, bb): - for feature, va in extractor.extract_insn_features(f, bb, insn): - features[feature].add(va) - for feature, va in extractor.extract_basic_block_features(f, bb): - features[feature].add(va) - for feature, va in extractor.extract_function_features(f): - features[feature].add(va) - return features - - -@lru_cache -def extract_basic_block_features(extractor, f, bb): - features = collections.defaultdict(set) - for insn in extractor.get_instructions(f, bb): - for feature, va in extractor.extract_insn_features(f, bb, insn): - features[feature].add(va) - for feature, va in 
extractor.extract_basic_block_features(f, bb): - features[feature].add(va) - return features +from fixtures import * @lru_cache def get_lancelot_extractor(path): + import capa.features.extractors.lancelot + with open(path, "rb") as f: buf = f.read() return capa.features.extractors.lancelot.LancelotFeatureExtractor(buf) -@pytest.fixture -def sample(request): - if request.param == "mimikatz": - return os.path.join(CD, "data", "mimikatz.exe_") - elif request.param == "kernel32": - return os.path.join(CD, "data", "kernel32.dll_") - elif request.param == "kernel32-64": - return os.path.join(CD, "data", "kernel32-64.dll_") - elif request.param == "pma12-04": - return os.path.join(CD, "data", "Practical Malware Analysis Lab 12-04.exe_") - elif request.param.startswith("a1982"): - return os.path.join(CD, "data", "a198216798ca38f280dc413f8c57f2c2.exe_") - elif request.param.startswith("39c05"): - return os.path.join(CD, "data", "39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.dll_") - elif request.param.startswith("c9188"): - return os.path.join(CD, "data", "c91887d861d9bd4a5872249b641bc9f9.exe_") - else: - raise ValueError("unexpected sample fixture") - - -def get_function(extractor, fva): - for f in extractor.get_functions(): - if f.__int__() == fva: - return f - raise ValueError("function not found") - - -def get_basic_block(extractor, f, va): - for bb in extractor.get_basic_blocks(f): - if bb.__int__() == va: - return bb - raise ValueError("basic block not found") - - -@pytest.fixture -def scope(request): - if request.param == "file": - - def inner(extractor): - return extract_file_features(extractor) - - inner.__name__ = request.param - return inner - elif "bb=" in request.param: - # like `function=0x401000,bb=0x40100A` - fspec, _, bbspec = request.param.partition(",") - fva = int(fspec.partition("=")[2], 0x10) - bbva = int(bbspec.partition("=")[2], 0x10) - - def inner(extractor): - f = get_function(extractor, fva) - bb = get_basic_block(extractor, f, 
bbva) - return extract_basic_block_features(extractor, f, bb) - - inner.__name__ = request.param - return inner - elif request.param.startswith("function"): - # like `function=0x401000` - va = int(request.param.partition("=")[2], 0x10) - - def inner(extractor): - f = get_function(extractor, va) - return extract_function_features(extractor, f) - - inner.__name__ = request.param - return inner - else: - raise ValueError("unexpected scope fixture") - - -def parametrize(params, values, **kwargs): - """ - extend `pytest.mark.parametrize` to pretty-print features. - by default, it renders objects as an opaque value. - ref: https://docs.pytest.org/en/2.9.0/example/parametrize.html#different-options-for-test-ids - - rendered ID might look something like: - - mimikatz-function=0x403BAC-api(CryptDestroyKey)-True - """ - ids = ["-".join(map(str, vs)) for vs in values] - return pytest.mark.parametrize(params, values, ids=ids, **kwargs) - - @parametrize( - "sample,scope,feature,expected", - [ - # file/characteristic("embedded pe") - ("pma12-04", "file", capa.features.Characteristic("embedded pe"), True), - # file/string - ("mimikatz", "file", capa.features.String("SCardControl"), True), - ("mimikatz", "file", capa.features.String("SCardTransmit"), True), - ("mimikatz", "file", capa.features.String("ACR > "), True), - ("mimikatz", "file", capa.features.String("nope"), False), - # file/sections - ("mimikatz", "file", capa.features.file.Section(".rsrc"), True), - ("mimikatz", "file", capa.features.file.Section(".text"), True), - ("mimikatz", "file", capa.features.file.Section(".nope"), False), - # file/exports - ("kernel32", "file", capa.features.file.Export("BaseThreadInitThunk"), True), - ("kernel32", "file", capa.features.file.Export("lstrlenW"), True), - ("kernel32", "file", capa.features.file.Export("nope"), False), - # file/imports - ("mimikatz", "file", capa.features.file.Import("advapi32.CryptSetHashParam"), True), - ("mimikatz", "file", 
capa.features.file.Import("CryptSetHashParam"), True), - ("mimikatz", "file", capa.features.file.Import("kernel32.IsWow64Process"), True), - ("mimikatz", "file", capa.features.file.Import("msvcrt.exit"), True), - ("mimikatz", "file", capa.features.file.Import("cabinet.#11"), True), - ("mimikatz", "file", capa.features.file.Import("#11"), False), - ("mimikatz", "file", capa.features.file.Import("#nope"), False), - ("mimikatz", "file", capa.features.file.Import("nope"), False), - # function/characteristic(loop) - ("mimikatz", "function=0x401517", capa.features.Characteristic("loop"), True), - ("mimikatz", "function=0x401000", capa.features.Characteristic("loop"), False), - # bb/characteristic(tight loop) - ("mimikatz", "function=0x402EC4", capa.features.Characteristic("tight loop"), True), - ("mimikatz", "function=0x401000", capa.features.Characteristic("tight loop"), False), - # bb/characteristic(stack string) - ("mimikatz", "function=0x4556E5", capa.features.Characteristic("stack string"), True), - ("mimikatz", "function=0x401000", capa.features.Characteristic("stack string"), False), - # bb/characteristic(tight loop) - ("mimikatz", "function=0x402EC4,bb=0x402F8E", capa.features.Characteristic("tight loop"), True), - ("mimikatz", "function=0x401000,bb=0x401000", capa.features.Characteristic("tight loop"), False), - # insn/mnemonic - ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("push"), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("movzx"), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("xor"), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("in"), False), - ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("out"), False), - # insn/number - ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x3136B0), True), - # insn/number: stack adjustments - ("mimikatz", "function=0x40105D", 
capa.features.insn.Number(0xC), False), - ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x10), False), - # insn/number: arch flavors - ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X32), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF, arch=ARCH_X64), False), - # insn/offset - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x4), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0xC), True), - # insn/offset: stack references - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x8), False), - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x10), False), - # insn/offset: negative - ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), - ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), - # insn/offset: arch flavors - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0, arch=ARCH_X32), True), - ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x0, arch=ARCH_X64), False), - # insn/api - ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContextW"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContext"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptGenKey"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptImportKey"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptDestroyKey"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptAcquireContextW"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptAcquireContext"), True), - ("mimikatz", 
"function=0x403BAC", capa.features.insn.API("CryptGenKey"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptImportKey"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("CryptDestroyKey"), True), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("Nope"), False), - ("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.Nope"), False), - # insn/api: thunk - ("mimikatz", "function=0x4556E5", capa.features.insn.API("advapi32.LsaQueryInformationPolicy"), True), - ("mimikatz", "function=0x4556E5", capa.features.insn.API("LsaQueryInformationPolicy"), True), - # insn/api: x64 - ("kernel32-64", "function=0x180001010", capa.features.insn.API("RtlVirtualUnwind"), True,), - ("kernel32-64", "function=0x180001010", capa.features.insn.API("RtlVirtualUnwind"), True), - # insn/api: x64 thunk - ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True,), - ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True), - # insn/api: resolve indirect calls - ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.CreatePipe"), True), - ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.SetHandleInformation"), True), - ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.CloseHandle"), True), - ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.WriteFile"), True), - # insn/string - ("mimikatz", "function=0x40105D", capa.features.String("SCardControl"), True), - ("mimikatz", "function=0x40105D", capa.features.String("SCardTransmit"), True), - ("mimikatz", "function=0x40105D", capa.features.String("ACR > "), True), - ("mimikatz", "function=0x40105D", capa.features.String("nope"), False), - # insn/string, pointer to string - ("mimikatz", "function=0x44EDEF", capa.features.String("INPUTEVENT"), True), - # insn/bytes - ("mimikatz", "function=0x40105D", capa.features.Bytes("SCardControl".encode("utf-16le")), 
True), - ("mimikatz", "function=0x40105D", capa.features.Bytes("SCardTransmit".encode("utf-16le")), True), - ("mimikatz", "function=0x40105D", capa.features.Bytes("ACR > ".encode("utf-16le")), True), - ("mimikatz", "function=0x40105D", capa.features.Bytes("nope".encode("ascii")), False), - # insn/bytes, pointer to bytes - ("mimikatz", "function=0x44EDEF", capa.features.Bytes("INPUTEVENT".encode("utf-16le")), True), - # insn/characteristic(nzxor) - ("mimikatz", "function=0x410DFC", capa.features.Characteristic("nzxor"), True), - ("mimikatz", "function=0x40105D", capa.features.Characteristic("nzxor"), False), - # insn/characteristic(nzxor): no security cookies - ("mimikatz", "function=0x46B67A", capa.features.Characteristic("nzxor"), False), - # insn/characteristic(peb access) - ("kernel32-64", "function=0x180001068", capa.features.Characteristic("peb access"), True), - ("mimikatz", "function=0x46B67A", capa.features.Characteristic("peb access"), False), - # insn/characteristic(gs access) - ("kernel32-64", "function=0x180001068", capa.features.Characteristic("gs access"), True), - ("mimikatz", "function=0x46B67A", capa.features.Characteristic("gs access"), False), - # insn/characteristic(cross section flow) - ("a1982...", "function=0x4014D0", capa.features.Characteristic("cross section flow"), True), - # insn/characteristic(cross section flow): imports don't count - ("kernel32-64", "function=0x180001068", capa.features.Characteristic("cross section flow"), False), - ("mimikatz", "function=0x46B67A", capa.features.Characteristic("cross section flow"), False), - # insn/characteristic(recursive call) - ("39c05...", "function=0x10003100", capa.features.Characteristic("recursive call"), True), - ("mimikatz", "function=0x46B67A", capa.features.Characteristic("recursive call"), False), - # insn/characteristic(indirect call) - ("mimikatz", "function=0x4175FF", capa.features.Characteristic("indirect call"), True), - ("mimikatz", "function=0x46B67A", 
capa.features.Characteristic("indirect call"), False), - # insn/characteristic(calls from) - ("mimikatz", "function=0x46B67A", capa.features.Characteristic("calls from"), True), - ("mimikatz", "function=0x40E51B", capa.features.Characteristic("calls from"), False), - # function/characteristic(calls to) - ("mimikatz", "function=0x40105D", capa.features.Characteristic("calls to"), True), - ("mimikatz", "function=0x46C0D2", capa.features.Characteristic("calls to"), False), - ("mimikatz", "function=0x40E51B", capa.features.Characteristic("calls to"), False), - ], - indirect=["sample", "scope"], + "sample,scope,feature,expected", FEATURE_PRESENCE_TESTS, indirect=["sample", "scope"], ) def test_lancelot_features(sample, scope, feature, expected): - extractor = get_lancelot_extractor(sample) - features = scope(extractor) - if expected: - msg = "%s should be found in %s" % (str(feature), scope.__name__) - else: - msg = "%s should not be found in %s" % (str(feature), scope.__name__) - assert feature.evaluate(features) == expected, msg + do_test_feature_presence(get_lancelot_extractor, sample, scope, feature, expected) @parametrize( - "sample,scope,feature,expected", - [ - ("mimikatz", "function=0x40E51B", capa.features.basicblock.BasicBlock(), 1), - ("mimikatz", "function=0x40E5C2", capa.features.basicblock.BasicBlock(), 7), - ("mimikatz", "function=0x40E5C2", capa.features.Characteristic("calls from"), 3), - ], - indirect=["sample", "scope"], + "sample,scope,feature,expected", FEATURE_COUNT_TESTS, indirect=["sample", "scope"], ) def test_lancelot_feature_counts(sample, scope, feature, expected): - extractor = get_lancelot_extractor(sample) - features = scope(extractor) - msg = "%s should be found %d times in %s" % (str(feature), expected, scope.__name__) - assert len(features[feature]) == expected, msg + do_test_feature_count(get_lancelot_extractor, sample, scope, feature, expected) From 09bca1e5f7e3499295cc740ac82ab0849d7ce839 Mon Sep 17 00:00:00 2001 From: William 
Ballenthin Date: Sun, 16 Aug 2020 00:04:39 -0600 Subject: [PATCH 35/44] setup: bump lancelot dep version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3de02fb0..76e5de7d 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ requirements = ["six", "tqdm", "pyyaml", "tabulate", "colorama", "termcolor", "r if sys.version_info >= (3, 0): # py3 requirements.append("networkx") - requirements.append("pylancelot~=0.3.5") + requirements.append("pylancelot~=0.3.6") else: # py2 requirements.append("enum34") From 9fa128b27d89507a990154b5bfa2689590e4e4f2 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 16 Aug 2020 00:05:04 -0600 Subject: [PATCH 36/44] tests: freeze: make py3 compatible --- capa/features/extractors/lancelot/__init__.py | 1 + tests/test_freeze.py | 14 ++++---------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/capa/features/extractors/lancelot/__init__.py b/capa/features/extractors/lancelot/__init__.py index bda1c89b..b8dbe104 100644 --- a/capa/features/extractors/lancelot/__init__.py +++ b/capa/features/extractors/lancelot/__init__.py @@ -63,6 +63,7 @@ class LancelotFeatureExtractor(capa.features.extractors.FeatureExtractor): def get_functions(self): for va in self.ws.get_functions(): + # this is just the address of the function yield va def extract_function_features(self, f): diff --git a/tests/test_freeze.py b/tests/test_freeze.py index 9ad92237..cbd31ad4 100644 --- a/tests/test_freeze.py +++ b/tests/test_freeze.py @@ -104,17 +104,14 @@ def compare_extractors_viv_null(viv_ext, null_ext): viv_ext (capa.features.extractors.viv.VivisectFeatureExtractor) null_ext (capa.features.extractors.NullFeatureExtractor) """ - - # TODO: ordering of these things probably doesn't work yet - assert list(viv_ext.extract_file_features()) == list(null_ext.extract_file_features()) - assert to_int(list(viv_ext.get_functions())) == list(null_ext.get_functions()) + assert list(map(to_int, 
viv_ext.get_functions())) == list(null_ext.get_functions()) for f in viv_ext.get_functions(): - assert to_int(list(viv_ext.get_basic_blocks(f))) == list(null_ext.get_basic_blocks(to_int(f))) + assert list(map(to_int, viv_ext.get_basic_blocks(f))) == list(null_ext.get_basic_blocks(to_int(f))) assert list(viv_ext.extract_function_features(f)) == list(null_ext.extract_function_features(to_int(f))) for bb in viv_ext.get_basic_blocks(f): - assert to_int(list(viv_ext.get_instructions(f, bb))) == list( + assert list(map(to_int, viv_ext.get_instructions(f, bb))) == list( null_ext.get_instructions(to_int(f), to_int(bb)) ) assert list(viv_ext.extract_basic_block_features(f, bb)) == list( @@ -129,10 +126,7 @@ def compare_extractors_viv_null(viv_ext, null_ext): def to_int(o): """helper to get int value of extractor items""" - if isinstance(o, list): - return map(lambda x: capa.helpers.oint(x), o) - else: - return capa.helpers.oint(o) + return capa.helpers.oint(o) def test_freeze_s_roundtrip(): From f424dd126fd61c28e951dc34e97069e08192102d Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 16 Aug 2020 00:05:26 -0600 Subject: [PATCH 37/44] *: py3 compat --- capa/features/freeze.py | 11 ++++------- tests/fixtures.py | 18 +++++++++++++++--- tests/test_lancelot_features.py | 10 ---------- tests/test_main.py | 1 - 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/capa/features/freeze.py b/capa/features/freeze.py index 37148957..bf4f0b66 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -101,7 +101,9 @@ def dumps(extractor): for feature, va in extractor.extract_basic_block_features(f, bb): ret["scopes"]["basic block"].append(serialize_feature(feature) + (hex(va), (hex(f), hex(bb),))) - for insn, insnva in sorted([(insn, int(insn)) for insn in extractor.get_instructions(f, bb)]): + for insnva, insn in sorted( + [(insn.__int__(), insn) for insn in extractor.get_instructions(f, bb)], key=lambda p: p[0] + ): 
ret["functions"][hex(f)][hex(bb)].append(hex(insnva)) for feature, va in extractor.extract_insn_features(f, bb, insn): @@ -245,12 +247,7 @@ def main(argv=None): logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) - vw = capa.main.get_workspace(args.sample, args.format) - - # don't import this at top level to support ida/py3 backend - import capa.features.extractors.viv - - extractor = capa.features.extractors.viv.VivisectFeatureExtractor(vw, args.sample) + extractor = capa.main.get_extractor(args.sample, args.format) with open(args.output, "wb") as f: f.write(dump(extractor)) diff --git a/tests/fixtures.py b/tests/fixtures.py index 50a766d8..a1812b15 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,6 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. import os +import sys import os.path import collections @@ -40,6 +41,16 @@ def get_viv_extractor(path): return capa.features.extractors.viv.VivisectFeatureExtractor(vw, path) +@lru_cache +def get_lancelot_extractor(path): + import capa.features.extractors.lancelot + + with open(path, "rb") as f: + buf = f.read() + + return capa.features.extractors.lancelot.LancelotFeatureExtractor(buf) + + @lru_cache() def extract_file_features(extractor): features = collections.defaultdict(set) @@ -386,9 +397,10 @@ def do_test_feature_count(get_extractor, sample, scope, feature, expected): def get_extractor(path): - # decide here which extractor to load for tests. - # maybe check which python version we've loaded or if we're in IDA. 
- extractor = get_viv_extractor(path) + if sys.version_info >= (3, 0): + extractor = get_lancelot_extractor(path) + else: + extractor = get_viv_extractor(path) # overload the extractor so that the fixture exposes `extractor.path` setattr(extractor, "path", path) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 96db0d83..624a63e5 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -10,16 +10,6 @@ from fixtures import * -@lru_cache -def get_lancelot_extractor(path): - import capa.features.extractors.lancelot - - with open(path, "rb") as f: - buf = f.read() - - return capa.features.extractors.lancelot.LancelotFeatureExtractor(buf) - - @parametrize( "sample,scope,feature,expected", FEATURE_PRESENCE_TESTS, indirect=["sample", "scope"], ) diff --git a/tests/test_main.py b/tests/test_main.py index e9cb3d00..9125c123 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -14,7 +14,6 @@ import capa.main import capa.rules import capa.engine import capa.features -import capa.features.extractors.viv from capa.engine import * From 6ba479882284ca44367a73341e00758ddb01625a Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 16 Aug 2020 01:09:06 -0600 Subject: [PATCH 38/44] tests: fixtures: add ctxmgr for catching xfail --- tests/fixtures.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/fixtures.py b/tests/fixtures.py index a1812b15..dee13889 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -9,6 +9,7 @@ import os import sys import os.path +import contextlib import collections import pytest @@ -28,6 +29,44 @@ except ImportError: CD = os.path.dirname(__file__) +@contextlib.contextmanager +def xfail(condition, reason=None): + """ + context manager that wraps a block that is expected to fail in some cases. + when it does fail (and is expected), then mark this as pytest.xfail. + if its unexpected, raise an exception, so the test fails. 
+ + example:: + + # this test: + # - passes on py3 if foo() works + # - fails on py3 if foo() fails + # - xfails on py2 if foo() fails + # - fails on py2 if foo() works + with xfail(sys.version_info < (3, 0), reason="py3 doesn't foo"): + foo() + """ + try: + # do the block + yield + except: + if condition: + # we expected the test to fail, so raise and register this via pytest + pytest.xfail(reason) + else: + # we don't expect an exception, so the test should fail + raise + else: + if not condition: + # here we expect the block to run successfully, + # and we've received no exception, + # so this is good + pass + else: + # we expected an exception, but didn't find one. that's an error. + raise RuntimeError("expected to fail, but didn't") + + @lru_cache() def get_viv_extractor(path): import capa.features.extractors.viv From 1fe945e3edae0fdf8e7fa685804810c888b275dc Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 16 Aug 2020 01:09:40 -0600 Subject: [PATCH 39/44] tests: main: xfail sc analysis on py3 --- tests/test_main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_main.py b/tests/test_main.py index 9125c123..bd1b0458 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -8,6 +8,7 @@ import textwrap +import pytest from fixtures import * import capa.main @@ -44,6 +45,7 @@ def test_main_single_rule(z9324d_extractor, tmpdir): assert capa.main.main([path, "-v", "-r", rule_file.strpath,]) == 0 +@pytest.mark.xfail(sys.version_info >= (3, 0), reason="lancelot doesn't support shellcode workspaces") def test_main_shellcode(z499c2_extractor): path = z499c2_extractor.path assert capa.main.main([path, "-vv", "-f", "sc32"]) == 0 From bb9803fcc05a0c9592dad59108d35b2a4abb62cf Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 16 Aug 2020 01:10:10 -0600 Subject: [PATCH 40/44] tests: limit tests to py2/py3 --- tests/test_lancelot_features.py | 6 ++++-- tests/test_viv_features.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 
deletions(-) diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 624a63e5..4c78a818 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -14,11 +14,13 @@ from fixtures import * "sample,scope,feature,expected", FEATURE_PRESENCE_TESTS, indirect=["sample", "scope"], ) def test_lancelot_features(sample, scope, feature, expected): - do_test_feature_presence(get_lancelot_extractor, sample, scope, feature, expected) + with xfail(sys.version_info < (3, 0), reason="lancelot only works on py3"): + do_test_feature_presence(get_lancelot_extractor, sample, scope, feature, expected) @parametrize( "sample,scope,feature,expected", FEATURE_COUNT_TESTS, indirect=["sample", "scope"], ) def test_lancelot_feature_counts(sample, scope, feature, expected): - do_test_feature_count(get_lancelot_extractor, sample, scope, feature, expected) + with xfail(sys.version_info < (3, 0), reason="lancelot only works on py3"): + do_test_feature_count(get_lancelot_extractor, sample, scope, feature, expected) diff --git a/tests/test_viv_features.py b/tests/test_viv_features.py index ed466d45..e24687e9 100644 --- a/tests/test_viv_features.py +++ b/tests/test_viv_features.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+import sys from fixtures import * @@ -13,11 +14,13 @@ from fixtures import * "sample,scope,feature,expected", FEATURE_PRESENCE_TESTS, indirect=["sample", "scope"], ) def test_viv_features(sample, scope, feature, expected): - do_test_feature_presence(get_viv_extractor, sample, scope, feature, expected) + with xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2"): + do_test_feature_presence(get_viv_extractor, sample, scope, feature, expected) @parametrize( "sample,scope,feature,expected", FEATURE_COUNT_TESTS, indirect=["sample", "scope"], ) def test_viv_feature_counts(sample, scope, feature, expected): - do_test_feature_count(get_viv_extractor, sample, scope, feature, expected) + with xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2"): + do_test_feature_count(get_viv_extractor, sample, scope, feature, expected) From ca95512811b723585af402ddb62d6f0ddc38e9b1 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 16 Aug 2020 01:36:25 -0600 Subject: [PATCH 41/44] ci: test on both py2 and py3 --- .github/workflows/tests.yml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d6cdd802..d204efdc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -40,7 +40,7 @@ jobs: - name: Run rule linter run: python scripts/lint.py rules/ - tests: + tests27: runs-on: ubuntu-latest needs: [code_style, rule_linter] steps: @@ -57,3 +57,20 @@ jobs: - name: Run tests run: pytest tests/ + tests38: + runs-on: ubuntu-latest + needs: [code_style, rule_linter] + steps: + - name: Checkout capa with submodules + uses: actions/checkout@v2 + with: + submodules: true + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install capa + run: pip install -e .[dev] + - name: Run tests + run: pytest tests/ + From 0cfbed05b4b87302d98bc0061de93813f6de3e3b Mon Sep 17 00:00:00 2001 From: William 
Ballenthin Date: Sun, 16 Aug 2020 01:41:25 -0600 Subject: [PATCH 42/44] ci: install pefile on py3 --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d204efdc..152a8ba6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -70,7 +70,8 @@ jobs: with: python-version: 3.8 - name: Install capa - run: pip install -e .[dev] + # TODO: remove `pefile` when we bump lancelot >= 0.3.7 + run: pip install -e .[dev] pefile - name: Run tests run: pytest tests/ From df4c75882d8718c6fa07189878b6817f5422e4d2 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Sun, 16 Aug 2020 04:58:35 -0600 Subject: [PATCH 43/44] dos2unix --- capa/features/extractors/lancelot/__init__.py | 184 ++-- .../extractors/lancelot/basicblock.py | 240 ++--- capa/features/extractors/lancelot/file.py | 162 +-- capa/features/extractors/lancelot/function.py | 128 +-- capa/features/extractors/lancelot/helpers.py | 66 +- .../extractors/lancelot/indirect_calls.py | 298 +++--- capa/features/extractors/lancelot/insn.py | 974 +++++++++--------- rules | 2 +- scripts/import-to-bn.py | 224 ++-- scripts/import-to-ida.py | 234 ++--- tests/test_ida_features.py | 208 ++-- tests/test_lancelot_features.py | 52 +- 12 files changed, 1386 insertions(+), 1386 deletions(-) diff --git a/capa/features/extractors/lancelot/__init__.py b/capa/features/extractors/lancelot/__init__.py index b8dbe104..e22acade 100644 --- a/capa/features/extractors/lancelot/__init__.py +++ b/capa/features/extractors/lancelot/__init__.py @@ -1,92 +1,92 @@ -# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at: [package root]/LICENSE.txt -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and limitations under the License. -import logging - -import lancelot - -import capa.features.extractors -import capa.features.extractors.lancelot.file -import capa.features.extractors.lancelot.insn -import capa.features.extractors.lancelot.function -import capa.features.extractors.lancelot.basicblock - -__all__ = ["file", "function", "basicblock", "insn"] -logger = logging.getLogger(__name__) - - -class BB(object): - """extend the lancelot.BasicBlock with an __int__ method to access the address""" - - def __init__(self, ws, bb): - super(BB, self).__init__() - self.ws = ws - self.address = bb.address - self.length = bb.length - self.predecessors = bb.predecessors - self.successors = bb.successors - - def __int__(self): - return self.address - - @property - def instructions(self): - va = self.address - while va < self.address + self.length: - try: - insn = self.ws.read_insn(va) - except ValueError: - logger.warning("failed to read instruction at 0x%x", va) - return - - yield insn - va += insn.length - - -class LancelotFeatureExtractor(capa.features.extractors.FeatureExtractor): - def __init__(self, buf): - super(LancelotFeatureExtractor, self).__init__() - self.buf = buf - self.ws = lancelot.from_bytes(buf) - self.ctx = {} - - def get_base_address(self): - return self.ws.base_address - - def extract_file_features(self): - for feature, va in capa.features.extractors.lancelot.file.extract_file_features(self.buf): - yield feature, va - - def get_functions(self): - for va in self.ws.get_functions(): - # this is just the address of the function - yield va - - def extract_function_features(self, f): - for feature, va in 
capa.features.extractors.lancelot.function.extract_function_features(self.ws, f): - yield feature, va - - def get_basic_blocks(self, f): - try: - cfg = self.ws.build_cfg(f) - except: - logger.warning("failed to build CFG for 0x%x", f) - return - else: - for bb in cfg.basic_blocks.values(): - yield BB(self.ws, bb) - - def extract_basic_block_features(self, f, bb): - for feature, va in capa.features.extractors.lancelot.basicblock.extract_basic_block_features(self.ws, bb): - yield feature, va - - def get_instructions(self, f, bb): - return bb.instructions - - def extract_insn_features(self, f, bb, insn): - for feature, va in capa.features.extractors.lancelot.insn.extract_insn_features(self, f, bb, insn): - yield feature, va +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging + +import lancelot + +import capa.features.extractors +import capa.features.extractors.lancelot.file +import capa.features.extractors.lancelot.insn +import capa.features.extractors.lancelot.function +import capa.features.extractors.lancelot.basicblock + +__all__ = ["file", "function", "basicblock", "insn"] +logger = logging.getLogger(__name__) + + +class BB(object): + """extend the lancelot.BasicBlock with an __int__ method to access the address""" + + def __init__(self, ws, bb): + super(BB, self).__init__() + self.ws = ws + self.address = bb.address + self.length = bb.length + self.predecessors = bb.predecessors + self.successors = bb.successors + + def __int__(self): + return self.address + + @property + def instructions(self): + va = self.address + while va < self.address + self.length: + try: + insn = self.ws.read_insn(va) + except ValueError: + logger.warning("failed to read instruction at 0x%x", va) + return + + yield insn + va += insn.length + + +class LancelotFeatureExtractor(capa.features.extractors.FeatureExtractor): + def __init__(self, buf): + super(LancelotFeatureExtractor, self).__init__() + self.buf = buf + self.ws = lancelot.from_bytes(buf) + self.ctx = {} + + def get_base_address(self): + return self.ws.base_address + + def extract_file_features(self): + for feature, va in capa.features.extractors.lancelot.file.extract_file_features(self.buf): + yield feature, va + + def get_functions(self): + for va in self.ws.get_functions(): + # this is just the address of the function + yield va + + def extract_function_features(self, f): + for feature, va in capa.features.extractors.lancelot.function.extract_function_features(self.ws, f): + yield feature, va + + def get_basic_blocks(self, f): + try: + cfg = self.ws.build_cfg(f) + except: + logger.warning("failed to build CFG for 0x%x", f) + return + else: + for bb in cfg.basic_blocks.values(): + yield BB(self.ws, bb) + + def extract_basic_block_features(self, f, bb): + for feature, va in 
capa.features.extractors.lancelot.basicblock.extract_basic_block_features(self.ws, bb): + yield feature, va + + def get_instructions(self, f, bb): + return bb.instructions + + def extract_insn_features(self, f, bb, insn): + for feature, va in capa.features.extractors.lancelot.insn.extract_insn_features(self, f, bb, insn): + yield feature, va diff --git a/capa/features/extractors/lancelot/basicblock.py b/capa/features/extractors/lancelot/basicblock.py index bad58ff4..e1dc0e36 100644 --- a/capa/features/extractors/lancelot/basicblock.py +++ b/capa/features/extractors/lancelot/basicblock.py @@ -1,120 +1,120 @@ -import string -import struct -import logging - -from lancelot import ( - FLOW_VA, - OPERAND_SIZE, - OPERAND_TYPE, - MEMORY_OPERAND_BASE, - OPERAND_TYPE_MEMORY, - OPERAND_TYPE_IMMEDIATE, - IMMEDIATE_OPERAND_VALUE, -) - -from capa.features import Characteristic -from capa.features.basicblock import BasicBlock -from capa.features.extractors.helpers import MIN_STACKSTRING_LEN - -logger = logging.getLogger(__name__) - - -def extract_bb_tight_loop(ws, bb): - """ check basic block for tight loop indicators """ - if bb.address in map(lambda flow: flow[FLOW_VA], bb.successors): - yield Characteristic("tight loop"), bb.address - - -def is_mov_imm_to_stack(insn): - if not insn.mnemonic.startswith("mov"): - return False - - try: - dst, src = insn.operands - except ValueError: - # not two operands - return False - - if src[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: - return False - - if src[IMMEDIATE_OPERAND_VALUE] < 0: - return False - - if dst[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: - return False - - if dst[MEMORY_OPERAND_BASE] not in ("ebp", "rbp", "esp", "rsp"): - return False - - return True - - -def is_printable_ascii(chars): - return all(c < 127 and chr(c) in string.printable for c in chars) - - -def is_printable_utf16le(chars): - if all(c == b"\x00" for c in chars[1::2]): - return is_printable_ascii(chars[::2]) - - -def get_printable_len(operand): - """ - Return string 
length if all operand bytes are ascii or utf16-le printable - """ - operand_size = operand[OPERAND_SIZE] - if operand_size == 8: - chars = struct.pack(" MIN_STACKSTRING_LEN: - return True - - return False - - -def extract_stackstring(ws, bb): - """ check basic block for stackstring indicators """ - if _bb_has_stackstring(ws, bb): - yield Characteristic("stack string"), bb.address - - -def extract_basic_block_features(ws, bb): - yield BasicBlock(), bb.address - for bb_handler in BASIC_BLOCK_HANDLERS: - for feature, va in bb_handler(ws, bb): - yield feature, va - - -BASIC_BLOCK_HANDLERS = ( - extract_bb_tight_loop, - extract_stackstring, -) +import string +import struct +import logging + +from lancelot import ( + FLOW_VA, + OPERAND_SIZE, + OPERAND_TYPE, + MEMORY_OPERAND_BASE, + OPERAND_TYPE_MEMORY, + OPERAND_TYPE_IMMEDIATE, + IMMEDIATE_OPERAND_VALUE, +) + +from capa.features import Characteristic +from capa.features.basicblock import BasicBlock +from capa.features.extractors.helpers import MIN_STACKSTRING_LEN + +logger = logging.getLogger(__name__) + + +def extract_bb_tight_loop(ws, bb): + """ check basic block for tight loop indicators """ + if bb.address in map(lambda flow: flow[FLOW_VA], bb.successors): + yield Characteristic("tight loop"), bb.address + + +def is_mov_imm_to_stack(insn): + if not insn.mnemonic.startswith("mov"): + return False + + try: + dst, src = insn.operands + except ValueError: + # not two operands + return False + + if src[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: + return False + + if src[IMMEDIATE_OPERAND_VALUE] < 0: + return False + + if dst[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: + return False + + if dst[MEMORY_OPERAND_BASE] not in ("ebp", "rbp", "esp", "rsp"): + return False + + return True + + +def is_printable_ascii(chars): + return all(c < 127 and chr(c) in string.printable for c in chars) + + +def is_printable_utf16le(chars): + if all(c == b"\x00" for c in chars[1::2]): + return is_printable_ascii(chars[::2]) + + +def 
get_printable_len(operand): + """ + Return string length if all operand bytes are ascii or utf16-le printable + """ + operand_size = operand[OPERAND_SIZE] + if operand_size == 8: + chars = struct.pack(" MIN_STACKSTRING_LEN: + return True + + return False + + +def extract_stackstring(ws, bb): + """ check basic block for stackstring indicators """ + if _bb_has_stackstring(ws, bb): + yield Characteristic("stack string"), bb.address + + +def extract_basic_block_features(ws, bb): + yield BasicBlock(), bb.address + for bb_handler in BASIC_BLOCK_HANDLERS: + for feature, va in bb_handler(ws, bb): + yield feature, va + + +BASIC_BLOCK_HANDLERS = ( + extract_bb_tight_loop, + extract_stackstring, +) diff --git a/capa/features/extractors/lancelot/file.py b/capa/features/extractors/lancelot/file.py index 913b69fc..73bc8243 100644 --- a/capa/features/extractors/lancelot/file.py +++ b/capa/features/extractors/lancelot/file.py @@ -1,81 +1,81 @@ -import pefile - -import capa.features.extractors.strings -from capa.features import String, Characteristic -from capa.features.file import Export, Import, Section - - -def extract_file_embedded_pe(buf, pe): - buf = buf[2:] - - total_offset = 2 - while True: - try: - offset = buf.index(b"MZ") - except ValueError: - return - else: - rest = buf[offset:] - total_offset += offset - - try: - _ = pefile.PE(data=rest) - except: - pass - else: - yield Characteristic("embedded pe"), total_offset - - buf = rest[2:] - total_offset += 2 - - -def extract_file_export_names(buf, pe): - if not hasattr(pe, "DIRECTORY_ENTRY_EXPORT"): - return - - base_address = pe.OPTIONAL_HEADER.ImageBase - for exp in pe.DIRECTORY_ENTRY_EXPORT.symbols: - yield Export(exp.name.decode("ascii")), base_address + exp.address - - -def extract_file_import_names(buf, pe): - base_address = pe.OPTIONAL_HEADER.ImageBase - for entry in pe.DIRECTORY_ENTRY_IMPORT: - libname = entry.dll.decode("ascii").lower().partition(".")[0] - for imp in entry.imports: - if imp.ordinal: - yield 
Import("%s.#%s" % (libname, imp.ordinal)), imp.address - else: - impname = imp.name.decode("ascii") - yield Import("%s.%s" % (libname, impname)), imp.address - yield Import("%s" % (impname)), imp.address - - -def extract_file_section_names(buf, pe): - base_address = pe.OPTIONAL_HEADER.ImageBase - for section in pe.sections: - yield Section(section.Name.partition(b"\x00")[0].decode("ascii")), base_address + section.VirtualAddress - - -def extract_file_strings(buf, pe): - for s in capa.features.extractors.strings.extract_ascii_strings(buf): - yield String(s.s), s.offset - - for s in capa.features.extractors.strings.extract_unicode_strings(buf): - yield String(s.s), s.offset - - -def extract_file_features(buf): - pe = pefile.PE(data=buf) - for file_handler in FILE_HANDLERS: - for feature, va in file_handler(buf, pe): - yield feature, va - - -FILE_HANDLERS = ( - extract_file_embedded_pe, - extract_file_export_names, - extract_file_import_names, - extract_file_section_names, - extract_file_strings, -) +import pefile + +import capa.features.extractors.strings +from capa.features import String, Characteristic +from capa.features.file import Export, Import, Section + + +def extract_file_embedded_pe(buf, pe): + buf = buf[2:] + + total_offset = 2 + while True: + try: + offset = buf.index(b"MZ") + except ValueError: + return + else: + rest = buf[offset:] + total_offset += offset + + try: + _ = pefile.PE(data=rest) + except: + pass + else: + yield Characteristic("embedded pe"), total_offset + + buf = rest[2:] + total_offset += 2 + + +def extract_file_export_names(buf, pe): + if not hasattr(pe, "DIRECTORY_ENTRY_EXPORT"): + return + + base_address = pe.OPTIONAL_HEADER.ImageBase + for exp in pe.DIRECTORY_ENTRY_EXPORT.symbols: + yield Export(exp.name.decode("ascii")), base_address + exp.address + + +def extract_file_import_names(buf, pe): + base_address = pe.OPTIONAL_HEADER.ImageBase + for entry in pe.DIRECTORY_ENTRY_IMPORT: + libname = 
entry.dll.decode("ascii").lower().partition(".")[0] + for imp in entry.imports: + if imp.ordinal: + yield Import("%s.#%s" % (libname, imp.ordinal)), imp.address + else: + impname = imp.name.decode("ascii") + yield Import("%s.%s" % (libname, impname)), imp.address + yield Import("%s" % (impname)), imp.address + + +def extract_file_section_names(buf, pe): + base_address = pe.OPTIONAL_HEADER.ImageBase + for section in pe.sections: + yield Section(section.Name.partition(b"\x00")[0].decode("ascii")), base_address + section.VirtualAddress + + +def extract_file_strings(buf, pe): + for s in capa.features.extractors.strings.extract_ascii_strings(buf): + yield String(s.s), s.offset + + for s in capa.features.extractors.strings.extract_unicode_strings(buf): + yield String(s.s), s.offset + + +def extract_file_features(buf): + pe = pefile.PE(data=buf) + for file_handler in FILE_HANDLERS: + for feature, va in file_handler(buf, pe): + yield feature, va + + +FILE_HANDLERS = ( + extract_file_embedded_pe, + extract_file_export_names, + extract_file_import_names, + extract_file_section_names, + extract_file_strings, +) diff --git a/capa/features/extractors/lancelot/function.py b/capa/features/extractors/lancelot/function.py index e8c8cc2c..b885f392 100644 --- a/capa/features/extractors/lancelot/function.py +++ b/capa/features/extractors/lancelot/function.py @@ -1,64 +1,64 @@ -import logging - -try: - from functools import lru_cache -except ImportError: - from backports.functools_lru_cache import lru_cache - -from lancelot import ( - FLOW_VA, - FLOW_TYPE, - FLOW_TYPE_CONDITIONAL_JUMP, - FLOW_TYPE_CONDITIONAL_MOVE, - FLOW_TYPE_UNCONDITIONAL_JUMP, -) - -from capa.features import Characteristic -from capa.features.extractors import loops - -logger = logging.getLogger(__name__) - - -@lru_cache -def get_call_graph(ws): - return ws.build_call_graph() - - -def extract_function_calls_to(ws, f): - cg = get_call_graph(ws) - - for caller in cg.calls_to.get(f, []): - yield Characteristic("calls 
to"), caller - - -def extract_function_loop(ws, f): - edges = [] - for bb in ws.build_cfg(f).basic_blocks.values(): - for flow in bb.successors: - if flow[FLOW_TYPE] in ( - FLOW_TYPE_UNCONDITIONAL_JUMP, - FLOW_TYPE_CONDITIONAL_JUMP, - FLOW_TYPE_CONDITIONAL_MOVE, - ): - edges.append((bb.address, flow[FLOW_VA])) - continue - - if edges and loops.has_loop(edges): - yield Characteristic("loop"), f - - -FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop) - - -_not_implemented = set([]) - - -def extract_function_features(ws, f): - for func_handler in FUNCTION_HANDLERS: - try: - for feature, va in func_handler(ws, f): - yield feature, va - except NotImplementedError: - if func_handler.__name__ not in _not_implemented: - logger.warning("not implemented: %s", func_handler.__name__) - _not_implemented.add(func_handler.__name__) +import logging + +try: + from functools import lru_cache +except ImportError: + from backports.functools_lru_cache import lru_cache + +from lancelot import ( + FLOW_VA, + FLOW_TYPE, + FLOW_TYPE_CONDITIONAL_JUMP, + FLOW_TYPE_CONDITIONAL_MOVE, + FLOW_TYPE_UNCONDITIONAL_JUMP, +) + +from capa.features import Characteristic +from capa.features.extractors import loops + +logger = logging.getLogger(__name__) + + +@lru_cache +def get_call_graph(ws): + return ws.build_call_graph() + + +def extract_function_calls_to(ws, f): + cg = get_call_graph(ws) + + for caller in cg.calls_to.get(f, []): + yield Characteristic("calls to"), caller + + +def extract_function_loop(ws, f): + edges = [] + for bb in ws.build_cfg(f).basic_blocks.values(): + for flow in bb.successors: + if flow[FLOW_TYPE] in ( + FLOW_TYPE_UNCONDITIONAL_JUMP, + FLOW_TYPE_CONDITIONAL_JUMP, + FLOW_TYPE_CONDITIONAL_MOVE, + ): + edges.append((bb.address, flow[FLOW_VA])) + continue + + if edges and loops.has_loop(edges): + yield Characteristic("loop"), f + + +FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop) + + +_not_implemented = set([]) + + +def 
extract_function_features(ws, f): + for func_handler in FUNCTION_HANDLERS: + try: + for feature, va in func_handler(ws, f): + yield feature, va + except NotImplementedError: + if func_handler.__name__ not in _not_implemented: + logger.warning("not implemented: %s", func_handler.__name__) + _not_implemented.add(func_handler.__name__) diff --git a/capa/features/extractors/lancelot/helpers.py b/capa/features/extractors/lancelot/helpers.py index 0e755104..fcf6c037 100644 --- a/capa/features/extractors/lancelot/helpers.py +++ b/capa/features/extractors/lancelot/helpers.py @@ -1,33 +1,33 @@ -from lancelot import ( - OPERAND_TYPE, - MEMORY_OPERAND_BASE, - MEMORY_OPERAND_DISP, - OPERAND_TYPE_MEMORY, - OPERAND_TYPE_IMMEDIATE, - IMMEDIATE_OPERAND_VALUE, - IMMEDIATE_OPERAND_IS_RELATIVE, -) - - -def get_operand_target(insn, op): - if op[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: - # call direct, x64 - # rip relative - # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 - if op[MEMORY_OPERAND_BASE] == "rip": - return op[MEMORY_OPERAND_DISP] + insn.address + insn.length - - # call direct, x32 - # mimikatz:0x403BD3 call ds:CryptAcquireContextW - elif op[MEMORY_OPERAND_BASE] == None: - return op[MEMORY_OPERAND_DISP] - - # call via thunk - # mimikatz:0x455A41 call LsaQueryInformationPolicy - elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op[IMMEDIATE_OPERAND_IS_RELATIVE]: - return op[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length - - elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE: - return op[IMMEDIATE_OPERAND_VALUE] - - raise ValueError("memory operand has no target") +from lancelot import ( + OPERAND_TYPE, + MEMORY_OPERAND_BASE, + MEMORY_OPERAND_DISP, + OPERAND_TYPE_MEMORY, + OPERAND_TYPE_IMMEDIATE, + IMMEDIATE_OPERAND_VALUE, + IMMEDIATE_OPERAND_IS_RELATIVE, +) + + +def get_operand_target(insn, op): + if op[OPERAND_TYPE] == OPERAND_TYPE_MEMORY: + # call direct, x64 + # rip relative + # kernel32-64:180001041 call cs:__imp_RtlVirtualUnwind_0 + if 
op[MEMORY_OPERAND_BASE] == "rip": + return op[MEMORY_OPERAND_DISP] + insn.address + insn.length + + # call direct, x32 + # mimikatz:0x403BD3 call ds:CryptAcquireContextW + elif op[MEMORY_OPERAND_BASE] == None: + return op[MEMORY_OPERAND_DISP] + + # call via thunk + # mimikatz:0x455A41 call LsaQueryInformationPolicy + elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE and op[IMMEDIATE_OPERAND_IS_RELATIVE]: + return op[IMMEDIATE_OPERAND_VALUE] + insn.address + insn.length + + elif op[OPERAND_TYPE] == OPERAND_TYPE_IMMEDIATE: + return op[IMMEDIATE_OPERAND_VALUE] + + raise ValueError("memory operand has no target") diff --git a/capa/features/extractors/lancelot/indirect_calls.py b/capa/features/extractors/lancelot/indirect_calls.py index e1318873..f6f376b0 100644 --- a/capa/features/extractors/lancelot/indirect_calls.py +++ b/capa/features/extractors/lancelot/indirect_calls.py @@ -1,149 +1,149 @@ -# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: [package root]/LICENSE.txt -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and limitations under the License. 
- -import collections - -from lancelot import ( - FLOW_VA, - OPERAND_TYPE, - PERMISSION_READ, - MEMORY_OPERAND_BASE, - MEMORY_OPERAND_DISP, - OPERAND_TYPE_MEMORY, - MEMORY_OPERAND_INDEX, - OPERAND_TYPE_REGISTER, - MEMORY_OPERAND_SEGMENT, - OPERAND_TYPE_IMMEDIATE, - IMMEDIATE_OPERAND_VALUE, - REGISTER_OPERAND_REGISTER, - IMMEDIATE_OPERAND_IS_RELATIVE, -) - -from capa.features.extractors.lancelot.helpers import get_operand_target - -DESTRUCTIVE_MNEMONICS = ("mov", "lea", "pop", "xor") - - -class NotFoundError(Exception): - pass - - -def read_instructions(ws, bb): - va = bb.address - while va < bb.address + bb.length: - try: - insn = ws.read_insn(va) - except ValueError: - return - - yield insn - va += insn.length - - -def build_instruction_predecessors(ws, cfg): - preds = collections.defaultdict(set) - - for bb in cfg.basic_blocks.values(): - insns = list(read_instructions(ws, bb)) - - for i, insn in enumerate(insns): - if i == 0: - for pred in bb.predecessors: - pred_bb = cfg.basic_blocks[pred[FLOW_VA]] - preds[insn.address].add(list(read_instructions(ws, pred_bb))[-1].address) - else: - preds[insn.address].add(insns[i - 1].address) - - return preds - - -def find_definition(ws, f, insn): - """ - scan backwards from the given address looking for assignments to the given register. - if a constant, return that value. - args: - ws (lancelot.PE) - f (int): the function start address - insn (lancelot.Instruction): call instruction to resolve - returns: - (va: int, value?: int|None): the address of the assignment and the value, if a constant. - raises: - NotFoundError: when the definition cannot be found. 
- """ - assert insn.mnemonic == "call" - op0 = insn.operands[0] - assert op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER - reg = op0[REGISTER_OPERAND_REGISTER] - - cfg = ws.build_cfg(f) - preds = build_instruction_predecessors(ws, cfg) - - q = collections.deque() - seen = set([]) - q.extend(preds[insn.address]) - while q: - cur = q.popleft() - - # skip if we've already processed this location - if cur in seen: - continue - seen.add(cur) - - insn = ws.read_insn(cur) - operands = insn.operands - - if len(operands) == 0: - q.extend(preds[cur]) - continue - - op0 = operands[0] - if not ( - op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER - and op0[REGISTER_OPERAND_REGISTER] == reg - and insn.mnemonic in DESTRUCTIVE_MNEMONICS - ): - q.extend(preds[cur]) - continue - - # if we reach here, the instruction is destructive to our target register. - - # we currently only support extracting the constant from something like: `mov $reg, IAT` - # so, any other pattern results in an unknown value, represented by None. - # this is a good place to extend in the future, if we need more robust support. - if insn.mnemonic != "mov": - return (cur, None) - else: - op1 = operands[1] - try: - target = get_operand_target(insn, op1) - except ValueError: - return (cur, None) - else: - return (cur, target) - - raise NotFoundError() - - -def is_indirect_call(insn): - return insn.mnemonic == "call" and insn.operands[0][OPERAND_TYPE] == OPERAND_TYPE_REGISTER - - -def resolve_indirect_call(ws, f, insn): - """ - inspect the given indirect call instruction and attempt to resolve the target address. - args: - ws (lancelot.PE): the analysis workspace - f (int): the address of the function to analyze - insn (lancelot.Instruction): the instruction at which to start analysis - returns: - (va: int, value?: int|None): the address of the assignment and the value, if a constant. - raises: - NotFoundError: when the definition cannot be found. 
- """ - assert is_indirect_call(insn) - return find_definition(ws, f, insn) +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import collections + +from lancelot import ( + FLOW_VA, + OPERAND_TYPE, + PERMISSION_READ, + MEMORY_OPERAND_BASE, + MEMORY_OPERAND_DISP, + OPERAND_TYPE_MEMORY, + MEMORY_OPERAND_INDEX, + OPERAND_TYPE_REGISTER, + MEMORY_OPERAND_SEGMENT, + OPERAND_TYPE_IMMEDIATE, + IMMEDIATE_OPERAND_VALUE, + REGISTER_OPERAND_REGISTER, + IMMEDIATE_OPERAND_IS_RELATIVE, +) + +from capa.features.extractors.lancelot.helpers import get_operand_target + +DESTRUCTIVE_MNEMONICS = ("mov", "lea", "pop", "xor") + + +class NotFoundError(Exception): + pass + + +def read_instructions(ws, bb): + va = bb.address + while va < bb.address + bb.length: + try: + insn = ws.read_insn(va) + except ValueError: + return + + yield insn + va += insn.length + + +def build_instruction_predecessors(ws, cfg): + preds = collections.defaultdict(set) + + for bb in cfg.basic_blocks.values(): + insns = list(read_instructions(ws, bb)) + + for i, insn in enumerate(insns): + if i == 0: + for pred in bb.predecessors: + pred_bb = cfg.basic_blocks[pred[FLOW_VA]] + preds[insn.address].add(list(read_instructions(ws, pred_bb))[-1].address) + else: + preds[insn.address].add(insns[i - 1].address) + + return preds + + +def find_definition(ws, f, insn): + """ + scan backwards from the given address looking for assignments to the given register. + if a constant, return that value. 
+ args: + ws (lancelot.PE) + f (int): the function start address + insn (lancelot.Instruction): call instruction to resolve + returns: + (va: int, value?: int|None): the address of the assignment and the value, if a constant. + raises: + NotFoundError: when the definition cannot be found. + """ + assert insn.mnemonic == "call" + op0 = insn.operands[0] + assert op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER + reg = op0[REGISTER_OPERAND_REGISTER] + + cfg = ws.build_cfg(f) + preds = build_instruction_predecessors(ws, cfg) + + q = collections.deque() + seen = set([]) + q.extend(preds[insn.address]) + while q: + cur = q.popleft() + + # skip if we've already processed this location + if cur in seen: + continue + seen.add(cur) + + insn = ws.read_insn(cur) + operands = insn.operands + + if len(operands) == 0: + q.extend(preds[cur]) + continue + + op0 = operands[0] + if not ( + op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER + and op0[REGISTER_OPERAND_REGISTER] == reg + and insn.mnemonic in DESTRUCTIVE_MNEMONICS + ): + q.extend(preds[cur]) + continue + + # if we reach here, the instruction is destructive to our target register. + + # we currently only support extracting the constant from something like: `mov $reg, IAT` + # so, any other pattern results in an unknown value, represented by None. + # this is a good place to extend in the future, if we need more robust support. + if insn.mnemonic != "mov": + return (cur, None) + else: + op1 = operands[1] + try: + target = get_operand_target(insn, op1) + except ValueError: + return (cur, None) + else: + return (cur, target) + + raise NotFoundError() + + +def is_indirect_call(insn): + return insn.mnemonic == "call" and insn.operands[0][OPERAND_TYPE] == OPERAND_TYPE_REGISTER + + +def resolve_indirect_call(ws, f, insn): + """ + inspect the given indirect call instruction and attempt to resolve the target address. 
+ args: + ws (lancelot.PE): the analysis workspace + f (int): the address of the function to analyze + insn (lancelot.Instruction): the instruction at which to start analysis + returns: + (va: int, value?: int|None): the address of the assignment and the value, if a constant. + raises: + NotFoundError: when the definition cannot be found. + """ + assert is_indirect_call(insn) + return find_definition(ws, f, insn) diff --git a/capa/features/extractors/lancelot/insn.py b/capa/features/extractors/lancelot/insn.py index 854ecb0d..d658bf79 100644 --- a/capa/features/extractors/lancelot/insn.py +++ b/capa/features/extractors/lancelot/insn.py @@ -1,487 +1,487 @@ -import logging -import itertools - -import pefile - -try: - from functools import lru_cache -except ImportError: - from backports.functools_lru_cache import lru_cache - -from lancelot import ( - OPERAND_TYPE, - PERMISSION_READ, - MEMORY_OPERAND_BASE, - MEMORY_OPERAND_DISP, - OPERAND_TYPE_MEMORY, - MEMORY_OPERAND_INDEX, - OPERAND_TYPE_REGISTER, - MEMORY_OPERAND_SEGMENT, - OPERAND_TYPE_IMMEDIATE, - IMMEDIATE_OPERAND_VALUE, - REGISTER_OPERAND_REGISTER, - IMMEDIATE_OPERAND_IS_RELATIVE, -) - -import capa.features.extractors.helpers -from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic -from capa.features.insn import Number, Offset, Mnemonic -from capa.features.extractors.lancelot.helpers import get_operand_target -from capa.features.extractors.lancelot.function import get_call_graph -from capa.features.extractors.lancelot.indirect_calls import NotFoundError, resolve_indirect_call - -logger = logging.getLogger(__name__) - - -# security cookie checks may perform non-zeroing XORs, these are expected within a certain -# byte range within the first and returning basic blocks, this helps to reduce FP features -SECURITY_COOKIE_BYTES_DELTA = 0x40 - - -def get_arch(ws): - if ws.arch == "x32": - return ARCH_X32 - elif ws.arch == "x64": - return ARCH_X64 - else: - raise 
ValueError("unexpected architecture") - - -@lru_cache -def get_pefile(xtor): - return pefile.PE(data=xtor.buf) - - -@lru_cache -def get_imports(xtor): - pe = get_pefile(xtor) - - imports = {} - for entry in pe.DIRECTORY_ENTRY_IMPORT: - libname = entry.dll.decode("ascii").lower().partition(".")[0] - for imp in entry.imports: - if imp.ordinal: - imports[imp.address] = "%s.#%s" % (libname, imp.ordinal) - else: - impname = imp.name.decode("ascii") - imports[imp.address] = "%s.%s" % (libname, impname) - return imports - - -@lru_cache -def get_thunks(xtor): - thunks = {} - for va in xtor.ws.get_functions(): - try: - insn = xtor.ws.read_insn(va) - except ValueError: - continue - - if insn.mnemonic != "jmp": - continue - - op0 = insn.operands[0] - - try: - target = get_operand_target(insn, op0) - except ValueError: - continue - - imports = get_imports(xtor) - if target not in imports: - continue - - thunks[va] = imports[target] - - return thunks - - -def extract_insn_api_features(xtor, f, bb, insn): - """parse API features from the given instruction.""" - - if insn.mnemonic != "call": - return - - op0 = insn.operands[0] - - if op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER: - try: - (_, target) = resolve_indirect_call(xtor.ws, f, insn) - except NotFoundError: - return - if target is None: - return - else: - try: - target = get_operand_target(insn, op0) - except ValueError: - return - - imports = get_imports(xtor) - if target in imports: - for feature, va in capa.features.extractors.helpers.generate_api_features(imports[target], insn.address): - yield feature, va - return - - thunks = get_thunks(xtor) - if target in thunks: - for feature, va in capa.features.extractors.helpers.generate_api_features(thunks[target], insn.address): - yield feature, va - - -def extract_insn_mnemonic_features(xtor, f, bb, insn): - """parse mnemonic features from the given instruction.""" - yield Mnemonic(insn.mnemonic), insn.address - - -def extract_insn_number_features(xtor, f, bb, insn): - 
"""parse number features from the given instruction.""" - operands = insn.operands - - for operand in operands: - if operand[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: - continue - - v = operand[IMMEDIATE_OPERAND_VALUE] - - if xtor.ws.probe(v) & PERMISSION_READ: - # v is a valid address - # therefore, assume its not also a constant. - continue - - if ( - insn.mnemonic == "add" - and operands[0][OPERAND_TYPE] == OPERAND_TYPE_REGISTER - and operands[0][REGISTER_OPERAND_REGISTER] == "esp" - ): - # skip things like: - # - # .text:00401140 call sub_407E2B - # .text:00401145 add esp, 0Ch - return - - yield Number(v), insn.address - yield Number(v, arch=get_arch(xtor.ws)), insn.address - - -def extract_insn_offset_features(xtor, f, bb, insn): - """parse structure offset features from the given instruction.""" - operands = insn.operands - - for operand in operands: - if operand[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: - continue - - if operand[MEMORY_OPERAND_BASE] in ("esp", "ebp", "rbp"): - continue - - # lancelot provides `None` when the displacement is not present. - v = operand[MEMORY_OPERAND_DISP] or 0 - - yield Offset(v), insn.address - yield Offset(v, arch=get_arch(xtor.ws)), insn.address - - -def derefs(xtor, p): - """ - recursively follow the given pointer, yielding the valid memory addresses along the way. - useful when you may have a pointer to string, or pointer to pointer to string, etc. - this is a "do what i mean" type of helper function. - """ - - depth = 0 - while True: - if not xtor.ws.probe(p) & PERMISSION_READ: - return - yield p - - next = xtor.ws.read_pointer(p) - - # sanity: pointer points to self - if next == p: - return - - # sanity: avoid chains of pointers that are unreasonably deep - depth += 1 - if depth > 10: - return - - p = next - - -def read_bytes(xtor, va): - """ - read up to MAX_BYTES_FEATURE_SIZE from the given address. - - raises: - ValueError: if the given address is not valid. 
- """ - start = va - end = va + MAX_BYTES_FEATURE_SIZE - pe = get_pefile(xtor) - - for section in pe.sections: - section_start = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress - section_end = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section.Misc_VirtualSize - - if section_start <= start < section_end: - end = min(end, section_end) - return xtor.ws.read_bytes(start, end - start) - - raise ValueError("invalid address") - - -# these are mnemonics that may flow (jump) elsewhere -FLOW_MNEMONICS = set( - [ - "call", - "jb", - "jbe", - "jcxz", - "jecxz", - "jknzd", - "jkzd", - "jl", - "jle", - "jmp", - "jnb", - "jnbe", - "jnl", - "jnle", - "jno", - "jnp", - "jns", - "jnz", - "jo", - "jp", - "jrcxz", - "js", - "jz", - ] -) - - -def extract_insn_bytes_features(xtor, f, bb, insn): - """ - parse byte sequence features from the given instruction. - """ - if insn.mnemonic in FLOW_MNEMONICS: - return - - for operand in insn.operands: - try: - target = get_operand_target(insn, operand) - except ValueError: - continue - - for ptr in derefs(xtor, target): - try: - buf = read_bytes(xtor, ptr) - except ValueError: - continue - - if capa.features.extractors.helpers.all_zeros(buf): - continue - - yield Bytes(buf), insn.address - - -def first(s): - """enumerate the first element in the sequence""" - for i in s: - yield i - break - - -def extract_insn_string_features(xtor, f, bb, insn): - """parse string features from the given instruction.""" - for bytez, va in extract_insn_bytes_features(xtor, f, bb, insn): - buf = bytez.value - - for s in itertools.chain( - first(capa.features.extractors.strings.extract_ascii_strings(buf)), - first(capa.features.extractors.strings.extract_unicode_strings(buf)), - ): - if s.offset == 0: - yield String(s.s), va - - -def is_security_cookie(xtor, f, bb, insn): - """ - check if an instruction is related to security cookie checks - """ - op1 = insn.operands[1] - if op1[OPERAND_TYPE] == OPERAND_TYPE_REGISTER and 
op1[REGISTER_OPERAND_REGISTER] not in ( - "esp", - "ebp", - "rbp", - "rsp", - ): - return False - - # expect security cookie init in first basic block within first bytes (instructions) - if f == bb.address and insn.address < (bb.address + SECURITY_COOKIE_BYTES_DELTA): - return True - - # ... or within last bytes (instructions) before a return - insns = list(xtor.get_instructions(f, bb)) - if insns[-1].mnemonic in ("ret", "retn") and insn.address > (bb.address + bb.length - SECURITY_COOKIE_BYTES_DELTA): - return True - - return False - - -def extract_insn_nzxor_characteristic_features(xtor, f, bb, insn): - """ - parse non-zeroing XOR instruction from the given instruction. - ignore expected non-zeroing XORs, e.g. security cookies. - """ - if insn.mnemonic != "xor": - return - - operands = insn.operands - if operands[0] == operands[1]: - return - - if is_security_cookie(xtor, f, bb, insn): - return - - yield Characteristic("nzxor"), insn.address - - -def extract_insn_peb_access_characteristic_features(xtor, f, bb, insn): - """ - parse peb access from the given function. 
fs:[0x30] on x86, gs:[0x60] on x64 - """ - for operand in insn.operands: - if ( - operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY - and operand[MEMORY_OPERAND_SEGMENT] == "gs" - and operand[MEMORY_OPERAND_DISP] == 0x60 - ): - yield Characteristic("peb access"), insn.address - - if ( - operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY - and operand[MEMORY_OPERAND_SEGMENT] == "fs" - and operand[MEMORY_OPERAND_DISP] == 0x30 - ): - yield Characteristic("peb access"), insn.address - - -def extract_insn_segment_access_features(xtor, f, bb, insn): - """ parse the instruction for access to fs or gs """ - for operand in insn.operands: - if operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and operand[MEMORY_OPERAND_SEGMENT] == "gs": - yield Characteristic("gs access"), insn.address - - if operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and operand[MEMORY_OPERAND_SEGMENT] == "fs": - yield Characteristic("fs access"), insn.address - - -def get_section(xtor, va): - pe = get_pefile(xtor) - - for i, section in enumerate(pe.sections): - section_start = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress - section_end = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section.Misc_VirtualSize - - if section_start <= va < section_end: - return i - - raise ValueError("invalid address") - - -def extract_insn_cross_section_cflow(xtor, f, bb, insn): - """ - inspect the instruction for a CALL or JMP that crosses section boundaries. 
- """ - if insn.mnemonic not in FLOW_MNEMONICS: - return - - try: - target = get_operand_target(insn, insn.operands[0]) - except ValueError: - return - - if target in get_imports(xtor): - return - - try: - if get_section(xtor, insn.address) != get_section(xtor, target): - yield Characteristic("cross section flow"), insn.address - except ValueError: - return - - -def extract_function_calls_from(xtor, f, bb, insn): - cg = get_call_graph(xtor.ws) - - for callee in cg.calls_from.get(insn.address, []): - yield Characteristic("calls from"), callee - - if callee == f: - yield Characteristic("recursive call"), insn.address - - # lancelot doesn't count API calls when constructing the call graph - # so we still have to scan for calls to an import - if insn.mnemonic != "call": - return - - try: - target = get_operand_target(insn, insn.operands[0]) - except ValueError: - return - - imports = get_imports(xtor) - if target in imports: - yield Characteristic("calls from"), target - - -# this is a feature that's most relevant at the function or basic block scope, -# however, its most efficient to extract at the instruction scope. 
-def extract_function_indirect_call_characteristic_features(xtor, f, bb, insn): - """ - extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4]) - does not include calls like => call ds:dword_ABD4974 - """ - if insn.mnemonic != "call": - return - - op0 = insn.operands[0] - if op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER: - yield Characteristic("indirect call"), insn.address - elif op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and op0[MEMORY_OPERAND_BASE] is not None: - yield Characteristic("indirect call"), insn.address - elif op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and op0[MEMORY_OPERAND_INDEX] is not None: - yield Characteristic("indirect call"), insn.address - - -_not_implemented = set([]) - - -def extract_insn_features(xtor, f, bb, insn): - for insn_handler in INSTRUCTION_HANDLERS: - try: - for feature, va in insn_handler(xtor, f, bb, insn): - yield feature, va - except NotImplementedError: - if insn_handler.__name__ not in _not_implemented: - logger.warning("not implemented: %s", insn_handler.__name__) - _not_implemented.add(insn_handler.__name__) - - -INSTRUCTION_HANDLERS = ( - extract_insn_api_features, - extract_insn_number_features, - extract_insn_string_features, - extract_insn_bytes_features, - extract_insn_offset_features, - extract_insn_nzxor_characteristic_features, - extract_insn_mnemonic_features, - extract_insn_peb_access_characteristic_features, - extract_insn_cross_section_cflow, - extract_insn_segment_access_features, - extract_function_calls_from, - extract_function_indirect_call_characteristic_features, -) +import logging +import itertools + +import pefile + +try: + from functools import lru_cache +except ImportError: + from backports.functools_lru_cache import lru_cache + +from lancelot import ( + OPERAND_TYPE, + PERMISSION_READ, + MEMORY_OPERAND_BASE, + MEMORY_OPERAND_DISP, + OPERAND_TYPE_MEMORY, + MEMORY_OPERAND_INDEX, + OPERAND_TYPE_REGISTER, + MEMORY_OPERAND_SEGMENT, + OPERAND_TYPE_IMMEDIATE, + 
IMMEDIATE_OPERAND_VALUE, + REGISTER_OPERAND_REGISTER, + IMMEDIATE_OPERAND_IS_RELATIVE, +) + +import capa.features.extractors.helpers +from capa.features import ARCH_X32, ARCH_X64, MAX_BYTES_FEATURE_SIZE, Bytes, String, Characteristic +from capa.features.insn import Number, Offset, Mnemonic +from capa.features.extractors.lancelot.helpers import get_operand_target +from capa.features.extractors.lancelot.function import get_call_graph +from capa.features.extractors.lancelot.indirect_calls import NotFoundError, resolve_indirect_call + +logger = logging.getLogger(__name__) + + +# security cookie checks may perform non-zeroing XORs, these are expected within a certain +# byte range within the first and returning basic blocks, this helps to reduce FP features +SECURITY_COOKIE_BYTES_DELTA = 0x40 + + +def get_arch(ws): + if ws.arch == "x32": + return ARCH_X32 + elif ws.arch == "x64": + return ARCH_X64 + else: + raise ValueError("unexpected architecture") + + +@lru_cache +def get_pefile(xtor): + return pefile.PE(data=xtor.buf) + + +@lru_cache +def get_imports(xtor): + pe = get_pefile(xtor) + + imports = {} + for entry in pe.DIRECTORY_ENTRY_IMPORT: + libname = entry.dll.decode("ascii").lower().partition(".")[0] + for imp in entry.imports: + if imp.ordinal: + imports[imp.address] = "%s.#%s" % (libname, imp.ordinal) + else: + impname = imp.name.decode("ascii") + imports[imp.address] = "%s.%s" % (libname, impname) + return imports + + +@lru_cache +def get_thunks(xtor): + thunks = {} + for va in xtor.ws.get_functions(): + try: + insn = xtor.ws.read_insn(va) + except ValueError: + continue + + if insn.mnemonic != "jmp": + continue + + op0 = insn.operands[0] + + try: + target = get_operand_target(insn, op0) + except ValueError: + continue + + imports = get_imports(xtor) + if target not in imports: + continue + + thunks[va] = imports[target] + + return thunks + + +def extract_insn_api_features(xtor, f, bb, insn): + """parse API features from the given instruction.""" + + if 
insn.mnemonic != "call": + return + + op0 = insn.operands[0] + + if op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER: + try: + (_, target) = resolve_indirect_call(xtor.ws, f, insn) + except NotFoundError: + return + if target is None: + return + else: + try: + target = get_operand_target(insn, op0) + except ValueError: + return + + imports = get_imports(xtor) + if target in imports: + for feature, va in capa.features.extractors.helpers.generate_api_features(imports[target], insn.address): + yield feature, va + return + + thunks = get_thunks(xtor) + if target in thunks: + for feature, va in capa.features.extractors.helpers.generate_api_features(thunks[target], insn.address): + yield feature, va + + +def extract_insn_mnemonic_features(xtor, f, bb, insn): + """parse mnemonic features from the given instruction.""" + yield Mnemonic(insn.mnemonic), insn.address + + +def extract_insn_number_features(xtor, f, bb, insn): + """parse number features from the given instruction.""" + operands = insn.operands + + for operand in operands: + if operand[OPERAND_TYPE] != OPERAND_TYPE_IMMEDIATE: + continue + + v = operand[IMMEDIATE_OPERAND_VALUE] + + if xtor.ws.probe(v) & PERMISSION_READ: + # v is a valid address + # therefore, assume its not also a constant. + continue + + if ( + insn.mnemonic == "add" + and operands[0][OPERAND_TYPE] == OPERAND_TYPE_REGISTER + and operands[0][REGISTER_OPERAND_REGISTER] == "esp" + ): + # skip things like: + # + # .text:00401140 call sub_407E2B + # .text:00401145 add esp, 0Ch + return + + yield Number(v), insn.address + yield Number(v, arch=get_arch(xtor.ws)), insn.address + + +def extract_insn_offset_features(xtor, f, bb, insn): + """parse structure offset features from the given instruction.""" + operands = insn.operands + + for operand in operands: + if operand[OPERAND_TYPE] != OPERAND_TYPE_MEMORY: + continue + + if operand[MEMORY_OPERAND_BASE] in ("esp", "ebp", "rbp"): + continue + + # lancelot provides `None` when the displacement is not present. 
+ v = operand[MEMORY_OPERAND_DISP] or 0 + + yield Offset(v), insn.address + yield Offset(v, arch=get_arch(xtor.ws)), insn.address + + +def derefs(xtor, p): + """ + recursively follow the given pointer, yielding the valid memory addresses along the way. + useful when you may have a pointer to string, or pointer to pointer to string, etc. + this is a "do what i mean" type of helper function. + """ + + depth = 0 + while True: + if not xtor.ws.probe(p) & PERMISSION_READ: + return + yield p + + next = xtor.ws.read_pointer(p) + + # sanity: pointer points to self + if next == p: + return + + # sanity: avoid chains of pointers that are unreasonably deep + depth += 1 + if depth > 10: + return + + p = next + + +def read_bytes(xtor, va): + """ + read up to MAX_BYTES_FEATURE_SIZE from the given address. + + raises: + ValueError: if the given address is not valid. + """ + start = va + end = va + MAX_BYTES_FEATURE_SIZE + pe = get_pefile(xtor) + + for section in pe.sections: + section_start = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section_end = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section.Misc_VirtualSize + + if section_start <= start < section_end: + end = min(end, section_end) + return xtor.ws.read_bytes(start, end - start) + + raise ValueError("invalid address") + + +# these are mnemonics that may flow (jump) elsewhere +FLOW_MNEMONICS = set( + [ + "call", + "jb", + "jbe", + "jcxz", + "jecxz", + "jknzd", + "jkzd", + "jl", + "jle", + "jmp", + "jnb", + "jnbe", + "jnl", + "jnle", + "jno", + "jnp", + "jns", + "jnz", + "jo", + "jp", + "jrcxz", + "js", + "jz", + ] +) + + +def extract_insn_bytes_features(xtor, f, bb, insn): + """ + parse byte sequence features from the given instruction. 
+ """ + if insn.mnemonic in FLOW_MNEMONICS: + return + + for operand in insn.operands: + try: + target = get_operand_target(insn, operand) + except ValueError: + continue + + for ptr in derefs(xtor, target): + try: + buf = read_bytes(xtor, ptr) + except ValueError: + continue + + if capa.features.extractors.helpers.all_zeros(buf): + continue + + yield Bytes(buf), insn.address + + +def first(s): + """enumerate the first element in the sequence""" + for i in s: + yield i + break + + +def extract_insn_string_features(xtor, f, bb, insn): + """parse string features from the given instruction.""" + for bytez, va in extract_insn_bytes_features(xtor, f, bb, insn): + buf = bytez.value + + for s in itertools.chain( + first(capa.features.extractors.strings.extract_ascii_strings(buf)), + first(capa.features.extractors.strings.extract_unicode_strings(buf)), + ): + if s.offset == 0: + yield String(s.s), va + + +def is_security_cookie(xtor, f, bb, insn): + """ + check if an instruction is related to security cookie checks + """ + op1 = insn.operands[1] + if op1[OPERAND_TYPE] == OPERAND_TYPE_REGISTER and op1[REGISTER_OPERAND_REGISTER] not in ( + "esp", + "ebp", + "rbp", + "rsp", + ): + return False + + # expect security cookie init in first basic block within first bytes (instructions) + if f == bb.address and insn.address < (bb.address + SECURITY_COOKIE_BYTES_DELTA): + return True + + # ... or within last bytes (instructions) before a return + insns = list(xtor.get_instructions(f, bb)) + if insns[-1].mnemonic in ("ret", "retn") and insn.address > (bb.address + bb.length - SECURITY_COOKIE_BYTES_DELTA): + return True + + return False + + +def extract_insn_nzxor_characteristic_features(xtor, f, bb, insn): + """ + parse non-zeroing XOR instruction from the given instruction. + ignore expected non-zeroing XORs, e.g. security cookies. 
+ """ + if insn.mnemonic != "xor": + return + + operands = insn.operands + if operands[0] == operands[1]: + return + + if is_security_cookie(xtor, f, bb, insn): + return + + yield Characteristic("nzxor"), insn.address + + +def extract_insn_peb_access_characteristic_features(xtor, f, bb, insn): + """ + parse peb access from the given function. fs:[0x30] on x86, gs:[0x60] on x64 + """ + for operand in insn.operands: + if ( + operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY + and operand[MEMORY_OPERAND_SEGMENT] == "gs" + and operand[MEMORY_OPERAND_DISP] == 0x60 + ): + yield Characteristic("peb access"), insn.address + + if ( + operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY + and operand[MEMORY_OPERAND_SEGMENT] == "fs" + and operand[MEMORY_OPERAND_DISP] == 0x30 + ): + yield Characteristic("peb access"), insn.address + + +def extract_insn_segment_access_features(xtor, f, bb, insn): + """ parse the instruction for access to fs or gs """ + for operand in insn.operands: + if operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and operand[MEMORY_OPERAND_SEGMENT] == "gs": + yield Characteristic("gs access"), insn.address + + if operand[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and operand[MEMORY_OPERAND_SEGMENT] == "fs": + yield Characteristic("fs access"), insn.address + + +def get_section(xtor, va): + pe = get_pefile(xtor) + + for i, section in enumerate(pe.sections): + section_start = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section_end = pe.OPTIONAL_HEADER.ImageBase + section.VirtualAddress + section.Misc_VirtualSize + + if section_start <= va < section_end: + return i + + raise ValueError("invalid address") + + +def extract_insn_cross_section_cflow(xtor, f, bb, insn): + """ + inspect the instruction for a CALL or JMP that crosses section boundaries. 
+ """ + if insn.mnemonic not in FLOW_MNEMONICS: + return + + try: + target = get_operand_target(insn, insn.operands[0]) + except ValueError: + return + + if target in get_imports(xtor): + return + + try: + if get_section(xtor, insn.address) != get_section(xtor, target): + yield Characteristic("cross section flow"), insn.address + except ValueError: + return + + +def extract_function_calls_from(xtor, f, bb, insn): + cg = get_call_graph(xtor.ws) + + for callee in cg.calls_from.get(insn.address, []): + yield Characteristic("calls from"), callee + + if callee == f: + yield Characteristic("recursive call"), insn.address + + # lancelot doesn't count API calls when constructing the call graph + # so we still have to scan for calls to an import + if insn.mnemonic != "call": + return + + try: + target = get_operand_target(insn, insn.operands[0]) + except ValueError: + return + + imports = get_imports(xtor) + if target in imports: + yield Characteristic("calls from"), target + + +# this is a feature that's most relevant at the function or basic block scope, +# however, its most efficient to extract at the instruction scope. 
+def extract_function_indirect_call_characteristic_features(xtor, f, bb, insn): + """ + extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4]) + does not include calls like => call ds:dword_ABD4974 + """ + if insn.mnemonic != "call": + return + + op0 = insn.operands[0] + if op0[OPERAND_TYPE] == OPERAND_TYPE_REGISTER: + yield Characteristic("indirect call"), insn.address + elif op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and op0[MEMORY_OPERAND_BASE] is not None: + yield Characteristic("indirect call"), insn.address + elif op0[OPERAND_TYPE] == OPERAND_TYPE_MEMORY and op0[MEMORY_OPERAND_INDEX] is not None: + yield Characteristic("indirect call"), insn.address + + +_not_implemented = set([]) + + +def extract_insn_features(xtor, f, bb, insn): + for insn_handler in INSTRUCTION_HANDLERS: + try: + for feature, va in insn_handler(xtor, f, bb, insn): + yield feature, va + except NotImplementedError: + if insn_handler.__name__ not in _not_implemented: + logger.warning("not implemented: %s", insn_handler.__name__) + _not_implemented.add(insn_handler.__name__) + + +INSTRUCTION_HANDLERS = ( + extract_insn_api_features, + extract_insn_number_features, + extract_insn_string_features, + extract_insn_bytes_features, + extract_insn_offset_features, + extract_insn_nzxor_characteristic_features, + extract_insn_mnemonic_features, + extract_insn_peb_access_characteristic_features, + extract_insn_cross_section_cflow, + extract_insn_segment_access_features, + extract_function_calls_from, + extract_function_indirect_call_characteristic_features, +) diff --git a/rules b/rules index 7e52464e..b3dfadeb 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit 7e52464e6e86a12b237c0dc9d0a5b9f7c47892f3 +Subproject commit b3dfadebeaa446c24b9810374c906ea53eb54a71 diff --git a/scripts/import-to-bn.py b/scripts/import-to-bn.py index 69e050cc..20851b33 100644 --- a/scripts/import-to-bn.py +++ b/scripts/import-to-bn.py @@ -1,112 +1,112 @@ -""" -Binary Ninja plugin 
that imports a capa report, -produced via `capa --json /path/to/sample`, -into the current database. - -It will mark up functions with their capa matches, like: - - ; capa: print debug messages (host-interaction/log/debug/write-event) - ; capa: delete service (host-interaction/service/delete) - ; Attributes: bp-based frame - - public UninstallService - UninstallService proc near - ... - -To use, invoke from the Binary Ninja Tools menu, or from the -command-palette. - -Adapted for Binary Ninja by @psifertex - -This script will verify that the report matches the workspace. -Check the log window for any errors, and/or the summary of changes. - -Derived from: https://github.com/fireeye/capa/blob/master/scripts/import-to-ida.py -""" -import os -import json - -from binaryninja import * - - -def append_func_cmt(bv, va, cmt): - """ - add the given comment to the given function, - if it doesn't already exist. - """ - func = bv.get_function_at(va) - if not func: - raise ValueError("not a function") - - if cmt in func.comment: - return - - func.comment = func.comment + "\n" + cmt - - -def load_analysis(bv): - shortname = os.path.splitext(os.path.basename(bv.file.filename))[0] - dirname = os.path.dirname(bv.file.filename) - log_info(f"dirname: {dirname}\nshortname: {shortname}\n") - if os.access(os.path.join(dirname, shortname + ".js"), os.R_OK): - path = os.path.join(dirname, shortname + ".js") - elif os.access(os.path.join(dirname, shortname + ".json"), os.R_OK): - path = os.path.join(dirname, shortname + ".json") - else: - path = interaction.get_open_filename_input("capa report:", "JSON (*.js *.json);;All Files (*)") - if not path or not os.access(path, os.R_OK): - log_error("Invalid filename.") - return 0 - log_info("Using capa file %s" % path) - - with open(path, "rb") as f: - doc = json.loads(f.read().decode("utf-8")) - - if "meta" not in doc or "rules" not in doc: - log_error("doesn't appear to be a capa report") - return -1 - - a = doc["meta"]["sample"]["md5"].lower() 
- md5 = Transform["MD5"] - rawhex = Transform["RawHex"] - b = rawhex.encode(md5.encode(bv.parent_view.read(bv.parent_view.start, bv.parent_view.end))).decode("utf-8") - if not a == b: - log_error("sample mismatch") - return -2 - - rows = [] - for rule in doc["rules"].values(): - if rule["meta"].get("lib"): - continue - if rule["meta"].get("capa/subscope"): - continue - if rule["meta"]["scope"] != "function": - continue - - name = rule["meta"]["name"] - ns = rule["meta"].get("namespace", "") - for va in rule["matches"].keys(): - va = int(va) - rows.append((ns, name, va)) - - # order by (namespace, name) so that like things show up together - rows = sorted(rows) - for ns, name, va in rows: - if ns: - cmt = "%s (%s)" % (name, ns) - else: - cmt = "%s" % (name,) - - log_info("0x%x: %s" % (va, cmt)) - try: - # message will look something like: - # - # capa: delete service (host-interaction/service/delete) - append_func_cmt(bv, va, "capa: " + cmt) - except ValueError: - continue - - log_info("ok") - - -PluginCommand.register("Load capa file", "Loads an analysis file from capa", load_analysis) +""" +Binary Ninja plugin that imports a capa report, +produced via `capa --json /path/to/sample`, +into the current database. + +It will mark up functions with their capa matches, like: + + ; capa: print debug messages (host-interaction/log/debug/write-event) + ; capa: delete service (host-interaction/service/delete) + ; Attributes: bp-based frame + + public UninstallService + UninstallService proc near + ... + +To use, invoke from the Binary Ninja Tools menu, or from the +command-palette. + +Adapted for Binary Ninja by @psifertex + +This script will verify that the report matches the workspace. +Check the log window for any errors, and/or the summary of changes. 
+ +Derived from: https://github.com/fireeye/capa/blob/master/scripts/import-to-ida.py +""" +import os +import json + +from binaryninja import * + + +def append_func_cmt(bv, va, cmt): + """ + add the given comment to the given function, + if it doesn't already exist. + """ + func = bv.get_function_at(va) + if not func: + raise ValueError("not a function") + + if cmt in func.comment: + return + + func.comment = func.comment + "\n" + cmt + + +def load_analysis(bv): + shortname = os.path.splitext(os.path.basename(bv.file.filename))[0] + dirname = os.path.dirname(bv.file.filename) + log_info(f"dirname: {dirname}\nshortname: {shortname}\n") + if os.access(os.path.join(dirname, shortname + ".js"), os.R_OK): + path = os.path.join(dirname, shortname + ".js") + elif os.access(os.path.join(dirname, shortname + ".json"), os.R_OK): + path = os.path.join(dirname, shortname + ".json") + else: + path = interaction.get_open_filename_input("capa report:", "JSON (*.js *.json);;All Files (*)") + if not path or not os.access(path, os.R_OK): + log_error("Invalid filename.") + return 0 + log_info("Using capa file %s" % path) + + with open(path, "rb") as f: + doc = json.loads(f.read().decode("utf-8")) + + if "meta" not in doc or "rules" not in doc: + log_error("doesn't appear to be a capa report") + return -1 + + a = doc["meta"]["sample"]["md5"].lower() + md5 = Transform["MD5"] + rawhex = Transform["RawHex"] + b = rawhex.encode(md5.encode(bv.parent_view.read(bv.parent_view.start, bv.parent_view.end))).decode("utf-8") + if not a == b: + log_error("sample mismatch") + return -2 + + rows = [] + for rule in doc["rules"].values(): + if rule["meta"].get("lib"): + continue + if rule["meta"].get("capa/subscope"): + continue + if rule["meta"]["scope"] != "function": + continue + + name = rule["meta"]["name"] + ns = rule["meta"].get("namespace", "") + for va in rule["matches"].keys(): + va = int(va) + rows.append((ns, name, va)) + + # order by (namespace, name) so that like things show up together 
+ rows = sorted(rows) + for ns, name, va in rows: + if ns: + cmt = "%s (%s)" % (name, ns) + else: + cmt = "%s" % (name,) + + log_info("0x%x: %s" % (va, cmt)) + try: + # message will look something like: + # + # capa: delete service (host-interaction/service/delete) + append_func_cmt(bv, va, "capa: " + cmt) + except ValueError: + continue + + log_info("ok") + + +PluginCommand.register("Load capa file", "Loads an analysis file from capa", load_analysis) diff --git a/scripts/import-to-ida.py b/scripts/import-to-ida.py index 9a5caff2..c52e2cd0 100644 --- a/scripts/import-to-ida.py +++ b/scripts/import-to-ida.py @@ -1,117 +1,117 @@ -""" -IDA Pro script that imports a capa report, -produced via `capa --json /path/to/sample`, -into the current database. - -It will mark up functions with their capa matches, like: - - ; capa: print debug messages (host-interaction/log/debug/write-event) - ; capa: delete service (host-interaction/service/delete) - ; Attributes: bp-based frame - - public UninstallService - UninstallService proc near - ... - -To use, invoke from the IDA Pro scripting dialog, -such as via Alt-F9, -and then select the existing capa report from the file system. - -This script will verify that the report matches the workspace. -Check the output window for any errors, and/or the summary of changes. - -Copyright (C) 2020 FireEye, Inc. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. -You may obtain a copy of the License at: [package root]/LICENSE.txt -Unless required by applicable law or agreed to in writing, software distributed under the License - is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and limitations under the License. 
-""" -import json -import logging - -import idc -import idautils -import ida_idaapi -import ida_kernwin - -logger = logging.getLogger("capa") - - -def append_func_cmt(va, cmt, repeatable=False): - """ - add the given comment to the given function, - if it doesn't already exist. - """ - func = ida_funcs.get_func(va) - if not func: - raise ValueError("not a function") - - existing = ida_funcs.get_func_cmt(func, repeatable) or "" - if cmt in existing: - return - - new = existing + "\n" + cmt - ida_funcs.set_func_cmt(func, new, repeatable) - - -def main(): - path = ida_kernwin.ask_file(False, "*", "capa report") - if not path: - return 0 - - with open(path, "rb") as f: - doc = json.loads(f.read().decode("utf-8")) - - if "meta" not in doc or "rules" not in doc: - logger.error("doesn't appear to be a capa report") - return -1 - - # in IDA 7.4, the MD5 hash may be truncated, for example: - # wanted: 84882c9d43e23d63b82004fae74ebb61 - # found: b'84882C9D43E23D63B82004FAE74EBB6\x00' - # - # see: https://github.com/idapython/bin/issues/11 - a = doc["meta"]["sample"]["md5"].lower() - b = idautils.GetInputFileMD5().decode("ascii").lower().rstrip("\x00") - if not a.startswith(b): - logger.error("sample mismatch") - return -2 - - rows = [] - for rule in doc["rules"].values(): - if rule["meta"].get("lib"): - continue - if rule["meta"].get("capa/subscope"): - continue - if rule["meta"]["scope"] != "function": - continue - - name = rule["meta"]["name"] - ns = rule["meta"].get("namespace", "") - for va in rule["matches"].keys(): - va = int(va) - rows.append((ns, name, va)) - - # order by (namespace, name) so that like things show up together - rows = sorted(rows) - for ns, name, va in rows: - if ns: - cmt = "%s (%s)" % (name, ns) - else: - cmt = "%s" % (name,) - - logger.info("0x%x: %s", va, cmt) - try: - # message will look something like: - # - # capa: delete service (host-interaction/service/delete) - append_func_cmt(va, "capa: " + cmt, repeatable=False) - except ValueError: - 
continue - - logger.info("ok") - - -main() +""" +IDA Pro script that imports a capa report, +produced via `capa --json /path/to/sample`, +into the current database. + +It will mark up functions with their capa matches, like: + + ; capa: print debug messages (host-interaction/log/debug/write-event) + ; capa: delete service (host-interaction/service/delete) + ; Attributes: bp-based frame + + public UninstallService + UninstallService proc near + ... + +To use, invoke from the IDA Pro scripting dialog, +such as via Alt-F9, +and then select the existing capa report from the file system. + +This script will verify that the report matches the workspace. +Check the output window for any errors, and/or the summary of changes. + +Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. +You may obtain a copy of the License at: [package root]/LICENSE.txt +Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. +""" +import json +import logging + +import idc +import idautils +import ida_idaapi +import ida_kernwin + +logger = logging.getLogger("capa") + + +def append_func_cmt(va, cmt, repeatable=False): + """ + add the given comment to the given function, + if it doesn't already exist. 
+ """ + func = ida_funcs.get_func(va) + if not func: + raise ValueError("not a function") + + existing = ida_funcs.get_func_cmt(func, repeatable) or "" + if cmt in existing: + return + + new = existing + "\n" + cmt + ida_funcs.set_func_cmt(func, new, repeatable) + + +def main(): + path = ida_kernwin.ask_file(False, "*", "capa report") + if not path: + return 0 + + with open(path, "rb") as f: + doc = json.loads(f.read().decode("utf-8")) + + if "meta" not in doc or "rules" not in doc: + logger.error("doesn't appear to be a capa report") + return -1 + + # in IDA 7.4, the MD5 hash may be truncated, for example: + # wanted: 84882c9d43e23d63b82004fae74ebb61 + # found: b'84882C9D43E23D63B82004FAE74EBB6\x00' + # + # see: https://github.com/idapython/bin/issues/11 + a = doc["meta"]["sample"]["md5"].lower() + b = idautils.GetInputFileMD5().decode("ascii").lower().rstrip("\x00") + if not a.startswith(b): + logger.error("sample mismatch") + return -2 + + rows = [] + for rule in doc["rules"].values(): + if rule["meta"].get("lib"): + continue + if rule["meta"].get("capa/subscope"): + continue + if rule["meta"]["scope"] != "function": + continue + + name = rule["meta"]["name"] + ns = rule["meta"].get("namespace", "") + for va in rule["matches"].keys(): + va = int(va) + rows.append((ns, name, va)) + + # order by (namespace, name) so that like things show up together + rows = sorted(rows) + for ns, name, va in rows: + if ns: + cmt = "%s (%s)" % (name, ns) + else: + cmt = "%s" % (name,) + + logger.info("0x%x: %s", va, cmt) + try: + # message will look something like: + # + # capa: delete service (host-interaction/service/delete) + append_func_cmt(va, "capa: " + cmt, repeatable=False) + except ValueError: + continue + + logger.info("ok") + + +main() diff --git a/tests/test_ida_features.py b/tests/test_ida_features.py index b227775e..86fd79b3 100644 --- a/tests/test_ida_features.py +++ b/tests/test_ida_features.py @@ -1,104 +1,104 @@ -# run this script from within IDA with 
./tests/data/mimikatz.exe open -import sys -import logging -import os.path -import binascii -import traceback - -import pytest - -try: - sys.path.append(os.path.dirname(__file__)) - from fixtures import * -finally: - sys.path.pop() - - -logger = logging.getLogger("test_ida_features") - - -def check_input_file(wanted): - import idautils - - # some versions (7.4) of IDA return a truncated version of the MD5. - # https://github.com/idapython/bin/issues/11 - try: - found = idautils.GetInputFileMD5()[:31].decode("ascii").lower() - except UnicodeDecodeError: - # in IDA 7.5 or so, GetInputFileMD5 started returning raw binary - # rather than the hex digest - found = binascii.hexlify(idautils.GetInputFileMD5()[:15]).decode("ascii").lower() - - if not wanted.startswith(found): - raise RuntimeError("please run the tests against sample with MD5: `%s`" % (wanted)) - - -def get_ida_extractor(_path): - check_input_file("5f66b82558ca92e54e77f216ef4c066c") - - # have to import import this inline so pytest doesn't bail outside of IDA - import capa.features.extractors.ida - - return capa.features.extractors.ida.IdaFeatureExtractor() - - -@pytest.mark.skip(reason="IDA Pro tests must be run within IDA") -def test_ida_features(): - for (sample, scope, feature, expected) in FEATURE_PRESENCE_TESTS: - id = make_test_id((sample, scope, feature, expected)) - - try: - check_input_file(get_sample_md5_by_name(sample)) - except RuntimeError: - print("SKIP %s" % (id)) - continue - - scope = resolve_scope(scope) - sample = resolve_sample(sample) - - try: - do_test_feature_presence(get_ida_extractor, sample, scope, feature, expected) - except Exception as e: - print("FAIL %s" % (id)) - traceback.print_exc() - else: - print("OK %s" % (id)) - - -@pytest.mark.skip(reason="IDA Pro tests must be run within IDA") -def test_ida_feature_counts(): - for (sample, scope, feature, expected) in FEATURE_COUNT_TESTS: - id = make_test_id((sample, scope, feature, expected)) - - try: - 
check_input_file(get_sample_md5_by_name(sample)) - except RuntimeError: - print("SKIP %s" % (id)) - continue - - scope = resolve_scope(scope) - sample = resolve_sample(sample) - - try: - do_test_feature_count(get_ida_extractor, sample, scope, feature, expected) - except Exception as e: - print("FAIL %s" % (id)) - traceback.print_exc() - else: - print("OK %s" % (id)) - - -if __name__ == "__main__": - print("-" * 80) - - # invoke all functions in this module that start with `test_` - for name in dir(sys.modules[__name__]): - if not name.startswith("test_"): - continue - - test = getattr(sys.modules[__name__], name) - logger.debug("invoking test: %s", name) - sys.stderr.flush() - test() - - print("DONE") +# run this script from within IDA with ./tests/data/mimikatz.exe open +import sys +import logging +import os.path +import binascii +import traceback + +import pytest + +try: + sys.path.append(os.path.dirname(__file__)) + from fixtures import * +finally: + sys.path.pop() + + +logger = logging.getLogger("test_ida_features") + + +def check_input_file(wanted): + import idautils + + # some versions (7.4) of IDA return a truncated version of the MD5. 
+ # https://github.com/idapython/bin/issues/11 + try: + found = idautils.GetInputFileMD5()[:31].decode("ascii").lower() + except UnicodeDecodeError: + # in IDA 7.5 or so, GetInputFileMD5 started returning raw binary + # rather than the hex digest + found = binascii.hexlify(idautils.GetInputFileMD5()[:15]).decode("ascii").lower() + + if not wanted.startswith(found): + raise RuntimeError("please run the tests against sample with MD5: `%s`" % (wanted)) + + +def get_ida_extractor(_path): + check_input_file("5f66b82558ca92e54e77f216ef4c066c") + + # have to import this inline so pytest doesn't bail outside of IDA + import capa.features.extractors.ida + + return capa.features.extractors.ida.IdaFeatureExtractor() + + +@pytest.mark.skip(reason="IDA Pro tests must be run within IDA") +def test_ida_features(): + for (sample, scope, feature, expected) in FEATURE_PRESENCE_TESTS: + id = make_test_id((sample, scope, feature, expected)) + + try: + check_input_file(get_sample_md5_by_name(sample)) + except RuntimeError: + print("SKIP %s" % (id)) + continue + + scope = resolve_scope(scope) + sample = resolve_sample(sample) + + try: + do_test_feature_presence(get_ida_extractor, sample, scope, feature, expected) + except Exception as e: + print("FAIL %s" % (id)) + traceback.print_exc() + else: + print("OK %s" % (id)) + + +@pytest.mark.skip(reason="IDA Pro tests must be run within IDA") +def test_ida_feature_counts(): + for (sample, scope, feature, expected) in FEATURE_COUNT_TESTS: + id = make_test_id((sample, scope, feature, expected)) + + try: + check_input_file(get_sample_md5_by_name(sample)) + except RuntimeError: + print("SKIP %s" % (id)) + continue + + scope = resolve_scope(scope) + sample = resolve_sample(sample) + + try: + do_test_feature_count(get_ida_extractor, sample, scope, feature, expected) + except Exception as e: + print("FAIL %s" % (id)) + traceback.print_exc() + else: + print("OK %s" % (id)) + + +if __name__ == "__main__": + print("-" * 80) + + # invoke all 
functions in this module that start with `test_` + for name in dir(sys.modules[__name__]): + if not name.startswith("test_"): + continue + + test = getattr(sys.modules[__name__], name) + logger.debug("invoking test: %s", name) + sys.stderr.flush() + test() + + print("DONE") diff --git a/tests/test_lancelot_features.py b/tests/test_lancelot_features.py index 4c78a818..e83b00af 100644 --- a/tests/test_lancelot_features.py +++ b/tests/test_lancelot_features.py @@ -1,26 +1,26 @@ -# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: [package root]/LICENSE.txt -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and limitations under the License. - - -from fixtures import * - - -@parametrize( - "sample,scope,feature,expected", FEATURE_PRESENCE_TESTS, indirect=["sample", "scope"], -) -def test_lancelot_features(sample, scope, feature, expected): - with xfail(sys.version_info < (3, 0), reason="lancelot only works on py3"): - do_test_feature_presence(get_lancelot_extractor, sample, scope, feature, expected) - - -@parametrize( - "sample,scope,feature,expected", FEATURE_COUNT_TESTS, indirect=["sample", "scope"], -) -def test_lancelot_feature_counts(sample, scope, feature, expected): - with xfail(sys.version_info < (3, 0), reason="lancelot only works on py3"): - do_test_feature_count(get_lancelot_extractor, sample, scope, feature, expected) +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + + +from fixtures import * + + +@parametrize( + "sample,scope,feature,expected", FEATURE_PRESENCE_TESTS, indirect=["sample", "scope"], +) +def test_lancelot_features(sample, scope, feature, expected): + with xfail(sys.version_info < (3, 0), reason="lancelot only works on py3"): + do_test_feature_presence(get_lancelot_extractor, sample, scope, feature, expected) + + +@parametrize( + "sample,scope,feature,expected", FEATURE_COUNT_TESTS, indirect=["sample", "scope"], +) +def test_lancelot_feature_counts(sample, scope, feature, expected): + with xfail(sys.version_info < (3, 0), reason="lancelot only works on py3"): + do_test_feature_count(get_lancelot_extractor, sample, scope, feature, expected) From 854e3d7774c0421c0d801525c0c35e0b15cedf4c Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 20 Aug 2020 15:15:14 -0600 Subject: [PATCH 44/44] submodule: rules update --- rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules b/rules index b3dfadeb..2f59740f 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit b3dfadebeaa446c24b9810374c906ea53eb54a71 +Subproject commit 2f59740f32dfdb3cb78913c4adc0cf1374763fd3