mirror of
https://github.com/mandiant/capa.git
synced 2025-12-22 07:10:29 -08:00
510 lines
18 KiB
Python
510 lines
18 KiB
Python
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
|
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and limitations under the License.
|
|
from typing import Any, Dict, Tuple, Iterator
|
|
|
|
import idc
|
|
import idaapi
|
|
import idautils
|
|
|
|
import capa.features.extractors.helpers
|
|
import capa.features.extractors.ida.helpers
|
|
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
|
|
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic
|
|
from capa.features.address import Address, AbsoluteVirtualAddress
|
|
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
|
|
|
|
# security cookie checks may perform non-zeroing XORs, these are expected within a certain
|
|
# byte range within the first and returning basic blocks, this helps to reduce FP features
|
|
SECURITY_COOKIE_BYTES_DELTA = 0x40
|
|
|
|
|
|
def get_imports(ctx: Dict[str, Any]) -> Dict[int, Any]:
|
|
if "imports_cache" not in ctx:
|
|
ctx["imports_cache"] = capa.features.extractors.ida.helpers.get_file_imports()
|
|
return ctx["imports_cache"]
|
|
|
|
|
|
def get_externs(ctx: Dict[str, Any]) -> Dict[int, Any]:
|
|
if "externs_cache" not in ctx:
|
|
ctx["externs_cache"] = capa.features.extractors.ida.helpers.get_file_externs()
|
|
return ctx["externs_cache"]
|
|
|
|
|
|
def check_for_api_call(insn: idaapi.insn_t, funcs: Dict[int, Any]) -> Iterator[Any]:
|
|
"""check instruction for API call"""
|
|
info = ()
|
|
ref = insn.ea
|
|
|
|
# attempt to resolve API calls by following chained thunks to a reasonable depth
|
|
for _ in range(THUNK_CHAIN_DEPTH_DELTA):
|
|
# assume only one code/data ref when resolving "call" or "jmp"
|
|
try:
|
|
ref = tuple(idautils.CodeRefsFrom(ref, False))[0]
|
|
except IndexError:
|
|
try:
|
|
# thunks may be marked as data refs
|
|
ref = tuple(idautils.DataRefsFrom(ref))[0]
|
|
except IndexError:
|
|
break
|
|
|
|
info = funcs.get(ref, ())
|
|
if info:
|
|
break
|
|
|
|
f = idaapi.get_func(ref)
|
|
if not f or not (f.flags & idaapi.FUNC_THUNK):
|
|
break
|
|
|
|
if info:
|
|
yield info
|
|
|
|
|
|
def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
|
|
"""
|
|
parse instruction API features
|
|
|
|
example:
|
|
call dword [0x00473038]
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if insn.get_canon_mnem() not in ("call", "jmp"):
|
|
return
|
|
|
|
# check calls to imported functions
|
|
for api in check_for_api_call(insn, get_imports(fh.ctx)):
|
|
# tuple (<module>, <function>, <ordinal>)
|
|
for name in capa.features.extractors.helpers.generate_symbols(api[0], api[1]):
|
|
yield API(name), ih.address
|
|
|
|
# check calls to extern functions
|
|
for api in check_for_api_call(insn, get_externs(fh.ctx)):
|
|
# tuple (<module>, <function>, <ordinal>)
|
|
yield API(api[1]), ih.address
|
|
|
|
# extract IDA/FLIRT recognized API functions
|
|
targets = tuple(idautils.CodeRefsFrom(insn.ea, False))
|
|
if not targets:
|
|
return
|
|
|
|
target = targets[0]
|
|
target_func = idaapi.get_func(target)
|
|
if not target_func or target_func.start_ea != target:
|
|
# not a function (start)
|
|
return
|
|
|
|
if target_func.flags & idaapi.FUNC_LIB:
|
|
name = idaapi.get_name(target_func.start_ea)
|
|
yield API(name), ih.address
|
|
if name.startswith("_"):
|
|
# some linkers may prefix linked routines with a `_` to avoid name collisions.
|
|
# extract features for both the mangled and un-mangled representations.
|
|
# e.g. `_fwrite` -> `fwrite`
|
|
# see: https://stackoverflow.com/a/2628384/87207
|
|
yield API(name[1:]), ih.address
|
|
|
|
|
|
def extract_insn_number_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""
|
|
parse instruction number features
|
|
example:
|
|
push 3136B0h ; dwControlCode
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if idaapi.is_ret_insn(insn):
|
|
# skip things like:
|
|
# .text:0042250E retn 8
|
|
return
|
|
|
|
if capa.features.extractors.ida.helpers.is_sp_modified(insn):
|
|
# skip things like:
|
|
# .text:00401145 add esp, 0Ch
|
|
return
|
|
|
|
for i, op in enumerate(insn.ops):
|
|
if op.type == idaapi.o_void:
|
|
break
|
|
if op.type not in (idaapi.o_imm, idaapi.o_mem):
|
|
continue
|
|
# skip things like:
|
|
# .text:00401100 shr eax, offset loc_C
|
|
if capa.features.extractors.ida.helpers.is_op_offset(insn, op):
|
|
continue
|
|
|
|
if op.type == idaapi.o_imm:
|
|
const = capa.features.extractors.ida.helpers.mask_op_val(op)
|
|
else:
|
|
const = op.addr
|
|
|
|
yield Number(const), ih.address
|
|
yield OperandNumber(i, const), ih.address
|
|
|
|
if insn.itype == idaapi.NN_add and 0 < const < MAX_STRUCTURE_SIZE and op.type == idaapi.o_imm:
|
|
# for pattern like:
|
|
#
|
|
# add eax, 0x10
|
|
#
|
|
# assume 0x10 is also an offset (imagine eax is a pointer).
|
|
yield Offset(const), ih.address
|
|
yield OperandOffset(i, const), ih.address
|
|
|
|
|
|
def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
|
|
"""
|
|
parse referenced byte sequences
|
|
example:
|
|
push offset iid_004118d4_IShellLinkA ; riid
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if idaapi.is_call_insn(insn):
|
|
return
|
|
|
|
ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn)
|
|
if ref != insn.ea:
|
|
extracted_bytes = capa.features.extractors.ida.helpers.read_bytes_at(ref, MAX_BYTES_FEATURE_SIZE)
|
|
if extracted_bytes and not capa.features.extractors.helpers.all_zeros(extracted_bytes):
|
|
if not capa.features.extractors.ida.helpers.find_string_at(ref):
|
|
# don't extract byte features for obvious strings
|
|
yield Bytes(extracted_bytes), ih.address
|
|
|
|
|
|
def extract_insn_string_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""
|
|
parse instruction string features
|
|
|
|
example:
|
|
push offset aAcr ; "ACR > "
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn)
|
|
if ref != insn.ea:
|
|
found = capa.features.extractors.ida.helpers.find_string_at(ref)
|
|
if found:
|
|
yield String(found), ih.address
|
|
|
|
|
|
def extract_insn_offset_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""
|
|
parse instruction structure offset features
|
|
|
|
example:
|
|
.text:0040112F cmp [esi+4], ebx
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
for i, op in enumerate(insn.ops):
|
|
if op.type == idaapi.o_void:
|
|
break
|
|
if op.type not in (idaapi.o_phrase, idaapi.o_displ):
|
|
continue
|
|
if capa.features.extractors.ida.helpers.is_op_stack_var(insn.ea, op.n):
|
|
continue
|
|
|
|
p_info = capa.features.extractors.ida.helpers.get_op_phrase_info(op)
|
|
|
|
op_off = p_info.get("offset")
|
|
if op_off is None:
|
|
continue
|
|
|
|
if idaapi.is_mapped(op_off):
|
|
# Ignore:
|
|
# mov esi, dword_1005B148[esi]
|
|
continue
|
|
|
|
# I believe that IDA encodes all offsets as two's complement in a u32.
|
|
# a 64-bit displacement isn't a thing, see:
|
|
# https://stackoverflow.com/questions/31853189/x86-64-assembly-why-displacement-not-64-bits
|
|
op_off = capa.features.extractors.helpers.twos_complement(op_off, 32)
|
|
|
|
yield Offset(op_off), ih.address
|
|
yield OperandOffset(i, op_off), ih.address
|
|
|
|
if (
|
|
insn.itype == idaapi.NN_lea
|
|
and i == 1
|
|
# o_displ is used for both:
|
|
# [eax+1]
|
|
# [eax+ebx+2]
|
|
and op.type == idaapi.o_displ
|
|
# but the SIB is only present for [eax+ebx+2]
|
|
# which we don't want
|
|
and not capa.features.extractors.ida.helpers.has_sib(op)
|
|
):
|
|
# for pattern like:
|
|
#
|
|
# lea eax, [ebx + 1]
|
|
#
|
|
# assume 1 is also an offset (imagine ebx is a zero register).
|
|
yield Number(op_off), ih.address
|
|
yield OperandNumber(i, op_off), ih.address
|
|
|
|
|
|
def contains_stack_cookie_keywords(s: str) -> bool:
|
|
"""
|
|
check if string contains stack cookie keywords
|
|
|
|
Examples:
|
|
xor ecx, ebp ; StackCookie
|
|
mov eax, ___security_cookie
|
|
"""
|
|
if not s:
|
|
return False
|
|
s = s.strip().lower()
|
|
if "cookie" not in s:
|
|
return False
|
|
return any(keyword in s for keyword in ("stack", "security"))
|
|
|
|
|
|
def bb_stack_cookie_registers(bb: idaapi.BasicBlock) -> Iterator[int]:
|
|
"""scan basic block for stack cookie operations
|
|
|
|
yield registers ids that may have been used for stack cookie operations
|
|
|
|
assume instruction that sets stack cookie and nzxor exist in same block
|
|
and stack cookie register is not modified prior to nzxor
|
|
|
|
Example:
|
|
.text:004062DA mov eax, ___security_cookie <-- stack cookie
|
|
.text:004062DF mov ecx, eax
|
|
.text:004062E1 mov ebx, [esi]
|
|
.text:004062E3 and ecx, 1Fh
|
|
.text:004062E6 mov edi, [esi+4]
|
|
.text:004062E9 xor ebx, eax
|
|
.text:004062EB mov esi, [esi+8]
|
|
.text:004062EE xor edi, eax <-- ignore
|
|
.text:004062F0 xor esi, eax <-- ignore
|
|
.text:004062F2 ror edi, cl
|
|
.text:004062F4 ror esi, cl
|
|
.text:004062F6 ror ebx, cl
|
|
.text:004062F8 cmp edi, esi
|
|
.text:004062FA jnz loc_40639D
|
|
|
|
TODO: this is expensive, but necessary?...
|
|
"""
|
|
for insn in capa.features.extractors.ida.helpers.get_instructions_in_range(bb.start_ea, bb.end_ea):
|
|
if contains_stack_cookie_keywords(idc.GetDisasm(insn.ea)):
|
|
for op in capa.features.extractors.ida.helpers.get_insn_ops(insn, target_ops=(idaapi.o_reg,)):
|
|
if capa.features.extractors.ida.helpers.is_op_write(insn, op):
|
|
# only include modified registers
|
|
yield op.reg
|
|
|
|
|
|
def is_nzxor_stack_cookie_delta(f: idaapi.func_t, bb: idaapi.BasicBlock, insn: idaapi.insn_t) -> bool:
|
|
"""check if nzxor exists within stack cookie delta"""
|
|
# security cookie check should use SP or BP
|
|
if not capa.features.extractors.ida.helpers.is_frame_register(insn.Op2.reg):
|
|
return False
|
|
|
|
f_bbs = tuple(capa.features.extractors.ida.helpers.get_function_blocks(f))
|
|
|
|
# expect security cookie init in first basic block within first bytes (instructions)
|
|
if capa.features.extractors.ida.helpers.is_basic_block_equal(bb, f_bbs[0]) and insn.ea < (
|
|
bb.start_ea + SECURITY_COOKIE_BYTES_DELTA
|
|
):
|
|
return True
|
|
|
|
# ... or within last bytes (instructions) before a return
|
|
if capa.features.extractors.ida.helpers.is_basic_block_return(bb) and insn.ea > (
|
|
bb.start_ea + capa.features.extractors.ida.helpers.basic_block_size(bb) - SECURITY_COOKIE_BYTES_DELTA
|
|
):
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def is_nzxor_stack_cookie(f: idaapi.func_t, bb: idaapi.BasicBlock, insn: idaapi.insn_t) -> bool:
|
|
"""check if nzxor is related to stack cookie"""
|
|
if contains_stack_cookie_keywords(idaapi.get_cmt(insn.ea, False)):
|
|
# Example:
|
|
# xor ecx, ebp ; StackCookie
|
|
return True
|
|
if is_nzxor_stack_cookie_delta(f, bb, insn):
|
|
return True
|
|
stack_cookie_regs = tuple(bb_stack_cookie_registers(bb))
|
|
if any(op_reg in stack_cookie_regs for op_reg in (insn.Op1.reg, insn.Op2.reg)):
|
|
# Example:
|
|
# mov eax, ___security_cookie
|
|
# xor eax, ebp
|
|
return True
|
|
return False
|
|
|
|
|
|
def extract_insn_nzxor_characteristic_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""
|
|
parse instruction non-zeroing XOR instruction
|
|
ignore expected non-zeroing XORs, e.g. security cookies
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if insn.itype not in (idaapi.NN_xor, idaapi.NN_xorpd, idaapi.NN_xorps, idaapi.NN_pxor):
|
|
return
|
|
if capa.features.extractors.ida.helpers.is_operand_equal(insn.Op1, insn.Op2):
|
|
return
|
|
if is_nzxor_stack_cookie(fh.inner, bbh.inner, insn):
|
|
return
|
|
yield Characteristic("nzxor"), ih.address
|
|
|
|
|
|
def extract_insn_mnemonic_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""parse instruction mnemonic features"""
|
|
yield Mnemonic(idc.print_insn_mnem(ih.inner.ea)), ih.address
|
|
|
|
|
|
def extract_insn_obfs_call_plus_5_characteristic_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""
|
|
parse call $+5 instruction from the given instruction.
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if not idaapi.is_call_insn(insn):
|
|
return
|
|
|
|
if insn.ea + 5 == idc.get_operand_value(insn.ea, 0):
|
|
yield Characteristic("call $+5"), ih.address
|
|
|
|
|
|
def extract_insn_peb_access_characteristic_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""parse instruction peb access
|
|
|
|
fs:[0x30] on x86, gs:[0x60] on x64
|
|
|
|
TODO:
|
|
IDA should be able to do this..
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if insn.itype not in (idaapi.NN_push, idaapi.NN_mov):
|
|
return
|
|
|
|
if all(op.type != idaapi.o_mem for op in insn.ops):
|
|
# try to optimize for only memory references
|
|
return
|
|
|
|
disasm = idc.GetDisasm(insn.ea)
|
|
|
|
if " fs:30h" in disasm or " gs:60h" in disasm:
|
|
# TODO(mike-hunhoff): use proper IDA API for fetching segment access
|
|
# scanning the disassembly text is a hack.
|
|
# https://github.com/mandiant/capa/issues/1605
|
|
yield Characteristic("peb access"), ih.address
|
|
|
|
|
|
def extract_insn_segment_access_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""parse instruction fs or gs access
|
|
|
|
TODO:
|
|
IDA should be able to do this...
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if all(op.type != idaapi.o_mem for op in insn.ops):
|
|
# try to optimize for only memory references
|
|
return
|
|
|
|
disasm = idc.GetDisasm(insn.ea)
|
|
|
|
if " fs:" in disasm:
|
|
# TODO(mike-hunhoff): use proper IDA API for fetching segment access
|
|
# scanning the disassembly text is a hack.
|
|
# https://github.com/mandiant/capa/issues/1605
|
|
yield Characteristic("fs access"), ih.address
|
|
|
|
if " gs:" in disasm:
|
|
# TODO(mike-hunhoff): use proper IDA API for fetching segment access
|
|
# scanning the disassembly text is a hack.
|
|
# https://github.com/mandiant/capa/issues/1605
|
|
yield Characteristic("gs access"), ih.address
|
|
|
|
|
|
def extract_insn_cross_section_cflow(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""inspect the instruction for a CALL or JMP that crosses section boundaries"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
for ref in idautils.CodeRefsFrom(insn.ea, False):
|
|
if ref in get_imports(fh.ctx):
|
|
# ignore API calls
|
|
continue
|
|
if not idaapi.getseg(ref):
|
|
# handle IDA API bug
|
|
continue
|
|
if idaapi.getseg(ref) == idaapi.getseg(insn.ea):
|
|
continue
|
|
yield Characteristic("cross section flow"), ih.address
|
|
|
|
|
|
def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
|
|
"""extract functions calls from features
|
|
|
|
most relevant at the function scope, however, its most efficient to extract at the instruction scope
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if idaapi.is_call_insn(insn):
|
|
for ref in idautils.CodeRefsFrom(insn.ea, False):
|
|
yield Characteristic("calls from"), AbsoluteVirtualAddress(ref)
|
|
|
|
|
|
def extract_function_indirect_call_characteristic_features(
|
|
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
|
) -> Iterator[Tuple[Feature, Address]]:
|
|
"""extract indirect function calls (e.g., call eax or call dword ptr [edx+4])
|
|
does not include calls like => call ds:dword_ABD4974
|
|
|
|
most relevant at the function or basic block scope;
|
|
however, its most efficient to extract at the instruction scope
|
|
"""
|
|
insn: idaapi.insn_t = ih.inner
|
|
|
|
if idaapi.is_call_insn(insn) and idc.get_operand_type(insn.ea, 0) in (idc.o_reg, idc.o_phrase, idc.o_displ):
|
|
yield Characteristic("indirect call"), ih.address
|
|
|
|
|
|
def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
|
|
"""extract instruction features"""
|
|
for inst_handler in INSTRUCTION_HANDLERS:
|
|
for feature, ea in inst_handler(f, bbh, insn):
|
|
yield feature, ea
|
|
|
|
|
|
INSTRUCTION_HANDLERS = (
|
|
extract_insn_api_features,
|
|
extract_insn_number_features,
|
|
extract_insn_bytes_features,
|
|
extract_insn_string_features,
|
|
extract_insn_offset_features,
|
|
extract_insn_nzxor_characteristic_features,
|
|
extract_insn_mnemonic_features,
|
|
extract_insn_obfs_call_plus_5_characteristic_features,
|
|
extract_insn_peb_access_characteristic_features,
|
|
extract_insn_cross_section_cflow,
|
|
extract_insn_segment_access_features,
|
|
extract_function_calls_from,
|
|
extract_function_indirect_call_characteristic_features,
|
|
)
|