Files
capa/capa/features/extractors/viv/insn.py
Ana María Martínez Gómez 64124c0b64 Remove True from Characteristic rules and output
Get rid of `True` in characteristic (rules, output and json) as it is
implicit. This way, the same syntax is used for characteristic as for
the rest of the features.

Co-authored-by: William Ballenthin <william.ballenthin@fireeye.com>
2020-07-02 16:50:15 +02:00

466 lines
15 KiB
Python

import envi.memory
import envi.archs.i386.disasm
import vivisect.const
from capa.features import String
from capa.features import Bytes
from capa.features import Characteristic
from capa.features import MAX_BYTES_FEATURE_SIZE
from capa.features.insn import Number
from capa.features.insn import Offset
from capa.features.insn import Mnemonic
import capa.features.extractors.helpers
from capa.features.extractors.viv.indirect_calls import NotFoundError
from capa.features.extractors.viv.indirect_calls import resolve_indirect_call
def interface_extract_instruction_XXX(f, bb, insn):
    '''
    template for the instruction-scope feature extractors in this module.

    it only illustrates the shared signature: the single pair it yields
    is made of placeholder exception instances, not a real feature.

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Feature, int): the feature and the address at which it is found.
    '''
    placeholder_feature = NotImplementedError('feature')
    placeholder_va = NotImplementedError('virtual address')
    yield placeholder_feature, placeholder_va
def get_imports(vw):
    '''
    caching accessor to the vivisect workspace imports.

    the mapping is built once from `vw.getImports()` (element 0 of each
    entry maps to element 3) and kept under `vw.metadata['imports']`,
    which avoids performance issues in vivisect when collecting locations.

    args:
      vw: the vivisect workspace.

    returns:
      dict: the cached mapping.
    '''
    if 'imports' not in vw.metadata:
        # first request for this workspace: build and remember the mapping
        by_location = {}
        for entry in vw.getImports():
            by_location[entry[0]] = entry[3]
        vw.metadata['imports'] = by_location
        return by_location

    return vw.metadata['imports']
def extract_insn_api_features(f, bb, insn):
    '''
    parse API features from the given instruction.

    only `call` instructions are inspected. the callee name is resolved
    according to the type of the first operand, and handed to
    `generate_api_features` together with the address of the instruction.

    example:

        call dword [0x00473038]

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Feature, int): the feature and the address at which it is found.
    '''
    if insn.mnem != 'call':
        return

    oper = insn.opers[0]

    if isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper):
        # traditional call via IAT
        target = oper.getOperAddr(insn)
        imports = get_imports(f.vw)
        # membership is tested on the dict itself: `.keys()` would build
        # a list on python 2 and make this a linear scan.
        if target not in imports:
            return
        api_name = imports[target]

    elif isinstance(oper, envi.archs.i386.disasm.i386PcRelOper):
        # call via thunk on x86,
        # see 9324d1a8ae37a36ae560c37448c9705a at 0x407985
        #
        # this is also how calls to internal functions may be decoded on x64.
        # see Lab21-01.exe_:0x140001178
        target = oper.getOperValue(insn)
        try:
            api_name = f.vw.getFunctionMeta(target, 'Thunk')
        except vivisect.exc.InvalidFunction:
            return
        if not api_name:
            # the callee carries no thunk metadata, so there is no API name
            return

    elif isinstance(oper, envi.archs.amd64.disasm.Amd64RipRelOper):
        # call via import on x64
        # see Lab21-01.exe_:0x14000118C
        target = oper.getOperAddr(insn)
        imports = get_imports(f.vw)
        if target not in imports:
            return
        api_name = imports[target]

    elif isinstance(oper, envi.archs.i386.disasm.i386RegOper):
        # call through a register: try to recover the value it holds
        try:
            (_, target) = resolve_indirect_call(f.vw, insn.va, insn=insn)
        except NotFoundError:
            # not able to resolve the indirect call, sorry
            return

        if target is None:
            # not able to resolve the indirect call, sorry
            return

        imports = get_imports(f.vw)
        if target not in imports:
            return
        api_name = imports[target]

    else:
        # any other operand type is not handled here
        return

    for feature, va in capa.features.extractors.helpers.generate_api_features(api_name, insn.va):
        yield feature, va
def extract_insn_number_features(f, bb, insn):
    '''
    parse number features from the given instruction.

    example:

        push    3136B0h         ; dwControlCode

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Number, int): the immediate value and the address of the instruction.
    '''
    for operand in insn.opers:
        # this is for both x32 and x64
        if isinstance(operand, envi.archs.i386.disasm.i386ImmOper):
            value = operand.getOperValue(operand)

            if f.vw.probeMemory(value, 1, envi.memory.MM_READ):
                # this is a valid address
                # assume its not also a constant.
                continue

            adjusts_stack = (insn.mnem == 'add'
                             and insn.opers[0].isReg()
                             and insn.opers[0].reg == envi.archs.i386.disasm.REG_ESP)
            if adjusts_stack:
                # skip things like:
                #
                #   .text:00401140                 call    sub_407E2B
                #   .text:00401145                 add     esp, 0Ch
                return

            yield Number(value), insn.va
def extract_insn_bytes_features(f, bb, insn):
    '''
    parse byte sequence features from the given instruction.

    example:

        push    offset iid_004118d4_IShellLinkA ; riid

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Bytes, int): the bytes read at an operand address (at most
        MAX_BYTES_FEATURE_SIZE of them) and the address of the instruction.
    '''
    if insn.mnem == 'call':
        # ignore call instructions
        return

    for operand in insn.opers:
        if isinstance(operand, envi.archs.i386.disasm.i386ImmOper):
            addr = operand.getOperValue(operand)
        elif isinstance(operand, envi.archs.i386.disasm.i386RegMemOper):
            # handle case like:
            #   movzx   ecx, ds:byte_423258[eax]
            addr = operand.disp
        elif isinstance(operand, envi.archs.amd64.disasm.Amd64RipRelOper):
            # see: Lab21-01.exe_:0x1400010D3
            addr = operand.getOperAddr(insn)
        else:
            continue

        segment = f.vw.getSegment(addr)
        if not segment:
            continue

        # do not read beyond the end of the segment
        segment_end = segment[0] + segment[1]
        size = min(MAX_BYTES_FEATURE_SIZE, segment_end - addr)

        try:
            data = f.vw.readMemory(addr, size)
        except envi.SegmentationViolation:
            continue

        # buffers made only of zeros are not reported
        if not capa.features.extractors.helpers.all_zeros(data):
            yield Bytes(data), insn.va
def read_string(vw, offset):
    '''
    read the string found at the given address of the workspace.

    `vw.detectString` is tried first and its result decoded as utf-8;
    otherwise `vw.detectUnicode` is tried and its result decoded as utf-16.

    args:
      vw: the vivisect workspace.
      offset (int): the address to read from.

    returns:
      the decoded string.

    raises:
      ValueError: if neither detector reports a string at `offset`.
    '''
    try:
        narrow_len = vw.detectString(offset)
    except envi.SegmentationViolation:
        narrow_len = 0

    if narrow_len > 0:
        return vw.readMemory(offset, narrow_len).decode('utf-8')

    try:
        wide_len = vw.detectUnicode(offset)
    except (envi.SegmentationViolation, IndexError):
        # IndexError: potential vivisect bug detecting Unicode at segment end
        wide_len = 0

    if wide_len > 0:
        # vivisect seems to mis-detect the end unicode strings
        # off by one, too short
        wide_len += wide_len % 2
        return vw.readMemory(offset, wide_len).decode('utf-16')

    raise ValueError('not a string', offset)
def extract_insn_string_features(f, bb, insn):
    '''
    parse string features from the given instruction.

    example:

        push    offset aAcr     ; "ACR  > "

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (String, int): the string referenced by an operand, with trailing
        NUL characters removed, and the address of the instruction.
    '''
    for operand in insn.opers:
        if isinstance(operand, envi.archs.i386.disasm.i386ImmOper):
            addr = operand.getOperValue(operand)
        elif isinstance(operand, envi.archs.amd64.disasm.Amd64RipRelOper):
            addr = operand.getOperAddr(insn)
        else:
            continue

        try:
            text = read_string(f.vw, addr)
        except ValueError:
            # nothing string-like at that address
            continue

        yield String(text.rstrip('\x00')), insn.va
def extract_insn_offset_features(f, bb, insn):
    '''
    parse structure offset features from the given instruction.

    example:

        .text:0040112F    cmp     [esi+4], ebx

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Offset, int): the displacement of a register-relative memory operand
        and the address of the instruction.
    '''
    for operand in insn.opers:
        # this is for both x32 and x64
        if isinstance(operand, envi.archs.i386.disasm.i386RegMemOper):
            # operands based on these registers are not reported.
            # TODO: do x64 support for real.
            if (operand.reg == envi.archs.i386.disasm.REG_ESP
                    or operand.reg == envi.archs.i386.disasm.REG_EBP
                    or operand.reg == envi.archs.amd64.disasm.REG_RBP):
                continue

            yield Offset(operand.disp), insn.va
def is_security_cookie(f, bb, insn):
    '''
    check if an instruction is related to security cookie checks.

    args:
      f (viv_utils.Function): the function that contains the instruction.
      bb (viv_utils.BasicBlock): the basic block that contains the instruction.
      insn (vivisect...Instruction): the instruction to check; its second
        operand is inspected.

    returns:
      bool: True when the instruction sits in the first 30 bytes of the
        first basic block of the function, or in the last 30 bytes of a
        basic block that ends with a return, and its second operand is
        not a register other than SP/BP.
    '''
    # security cookie check should use SP or BP
    second = insn.opers[1]
    if second.isReg():
        # TODO: do x64 support for real.
        stack_registers = (
            envi.archs.i386.disasm.REG_ESP,
            envi.archs.i386.disasm.REG_EBP,
            envi.archs.amd64.disasm.REG_RBP,
            envi.archs.amd64.disasm.REG_RSP,
        )
        if second.reg not in stack_registers:
            return False

    # expect security cookie init in first basic block within first bytes (instructions)
    first_block = f.basic_blocks[0]
    if bb == first_block and insn.va < bb.va + 30:
        return True

    # ... or within last bytes (instructions) before a return
    if bb.instructions[-1].isReturn() and insn.va > bb.va + bb.size - 30:
        return True

    return False
def extract_insn_nzxor_characteristic_features(f, bb, insn):
    '''
    parse non-zeroing XOR instruction from the given instruction.
    ignore expected non-zeroing XORs, e.g. security cookies.

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Characteristic, int): the `nzxor` characteristic and the address
        of the instruction.
    '''
    if insn.mnem != 'xor':
        return

    operands = insn.opers
    # xor of an operand with itself only clears it
    zeroing = operands[0] == operands[1]
    if zeroing or is_security_cookie(f, bb, insn):
        return

    yield Characteristic('nzxor'), insn.va
def extract_insn_mnemonic_features(f, bb, insn):
    '''
    parse mnemonic features from the given instruction.

    args:
      f (viv_utils.Function): the function to process (unused).
      bb (viv_utils.BasicBlock): the basic block to process (unused).
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Mnemonic, int): the mnemonic of the instruction and its address.
    '''
    mnemonic = insn.mnem
    yield Mnemonic(mnemonic), insn.va
def extract_insn_peb_access_characteristic_features(f, bb, insn):
    '''
    parse peb access from the given instruction. fs:[0x30] on x86, gs:[0x60] on x64

    only `push` and `mov` instructions are inspected.

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Characteristic, int): the `peb access` characteristic and the address
        of the instruction, once per matching operand.
    '''
    # TODO handle where fs/gs are loaded into a register or onto the stack and used later

    if insn.mnem not in ('push', 'mov'):
        return

    if 'fs' in insn.getPrefixName():
        # examples
        #
        #  IDA: mov     eax, large fs:30h
        #  viv: fs: mov eax,dword [0x00000030]  ; i386ImmMemOper
        #  IDA: push    large dword ptr fs:30h
        #  viv: fs: push dword [0x00000030]
        #  fs: push dword [eax + 0x30]  ; i386RegMemOper, with eax = 0
        for operand in insn.opers:
            if isinstance(operand, envi.archs.i386.disasm.i386RegMemOper) and operand.disp == 0x30:
                yield Characteristic('peb access'), insn.va
            elif isinstance(operand, envi.archs.i386.disasm.i386ImmMemOper) and operand.imm == 0x30:
                yield Characteristic('peb access'), insn.va
        return

    if 'gs' in insn.getPrefixName():
        for operand in insn.opers:
            if isinstance(operand, envi.archs.amd64.disasm.i386RegMemOper) and operand.disp == 0x60:
                yield Characteristic('peb access'), insn.va
            elif isinstance(operand, envi.archs.amd64.disasm.i386ImmMemOper) and operand.imm == 0x60:
                yield Characteristic('peb access'), insn.va
def extract_insn_segment_access_features(f, bb, insn):
    '''
    parse the instruction for access to fs or gs.

    args:
      f (viv_utils.Function): the function to process (unused).
      bb (viv_utils.BasicBlock): the basic block to process (unused).
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Characteristic, int): `fs access` or `gs access`, when the prefix
        name of the instruction is exactly `fs` or `gs`, and the address
        of the instruction.
    '''
    segment = insn.getPrefixName()

    if segment == 'fs':
        yield Characteristic('fs access'), insn.va
    elif segment == 'gs':
        yield Characteristic('gs access'), insn.va
def get_section(vw, va):
    '''
    find the memory map of the workspace that contains the given address.

    args:
      vw: the vivisect workspace.
      va (int): the address to look up.

    returns:
      the start address of the first entry of `vw.getMemoryMaps()` whose
      range [start, start + length) contains `va`.

    raises:
      KeyError: if no entry contains `va`.
    '''
    for base, size, _unused_a, _unused_b in vw.getMemoryMaps():
        if va < base:
            continue
        if va < base + size:
            return base

    raise KeyError(va)
def extract_insn_cross_section_cflow(f, bb, insn):
    '''
    inspect the instruction for a CALL or JMP that crosses section boundaries.

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Characteristic, int): the `cross section flow` characteristic and
        the address of the instruction, once per branch whose destination
        lies in a different section than the instruction.
    '''
    for branch_va, branch_flags in insn.getBranches():
        # fallthrough to the next instruction is not a transfer of interest
        if branch_flags & envi.BR_FALL:
            continue

        try:
            if insn.mnem == 'call':
                callee = insn.opers[0]
                # skip 32-bit (IAT memory operand) and 64-bit (RIP-relative) calls to imports
                if (isinstance(callee, envi.archs.i386.disasm.i386ImmMemOper)
                        or isinstance(callee, envi.archs.amd64.disasm.Amd64RipRelOper)):
                    if callee.getOperAddr(insn) in get_imports(f.vw):
                        continue

            if get_section(f.vw, insn.va) == get_section(f.vw, branch_va):
                continue

            yield Characteristic('cross section flow'), insn.va
        except KeyError:
            # raised by get_section when an address is outside every memory map
            continue
# this is a feature that's most relevant at the function scope,
# however, it's most efficient to extract at the instruction scope.
def extract_function_calls_from(f, bb, insn):
    '''
    extract the target of a call instruction, and flag recursive calls.

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Characteristic, int): the `calls from` characteristic paired with the
        call target, followed by `recursive call` when that target is the
        address of `f` itself.
    '''
    if insn.mnem != 'call':
        return

    callee = insn.opers[0]

    if isinstance(callee, envi.archs.i386.disasm.i386ImmMemOper):
        # traditional call via IAT, x32
        target = callee.getOperAddr(insn)
    elif isinstance(callee, envi.archs.i386.disasm.i386PcRelOper):
        # call via thunk on x86,
        # see 9324d1a8ae37a36ae560c37448c9705a at 0x407985
        #
        # call to internal function on x64
        # see Lab21-01.exe_:0x140001178
        target = callee.getOperValue(insn)
    elif isinstance(callee, envi.archs.amd64.disasm.Amd64RipRelOper):
        # call via IAT, x64
        target = callee.getOperAddr(insn)
    else:
        # other operand types: no target is extracted here
        return

    yield Characteristic('calls from'), target

    if target and target == f.va:
        # if we found a jump target and it's the function address
        # mark as recursive
        yield Characteristic('recursive call'), target
# this is a feature that's most relevant at the function or basic block scope,
# however, it's most efficient to extract at the instruction scope.
def extract_function_indirect_call_characteristic_features(f, bb, insn):
    '''
    extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4])
    does not include calls like => call ds:dword_ABD4974

    args:
      f (viv_utils.Function): the function to process.
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Characteristic, int): the `indirect call` characteristic and the
        address of the instruction.
    '''
    if insn.mnem != 'call':
        return

    # Checks below work for x86 and x64
    indirect_operand_types = (
        envi.archs.i386.disasm.i386RegOper,     # call edx
        envi.archs.i386.disasm.i386RegMemOper,  # call dword ptr [eax+50h]
        envi.archs.i386.disasm.i386SibOper,     # call qword ptr [rsp+78h]
    )
    if isinstance(insn.opers[0], indirect_operand_types):
        yield Characteristic('indirect call'), insn.va
def extract_features(f, bb, insn):
    '''
    extract features from the given insn.

    every handler listed in INSTRUCTION_HANDLERS is run against the
    instruction, in order, and its results are passed through.

    args:
      f (viv_utils.Function): the function from which to extract features
      bb (viv_utils.BasicBlock): the basic block to process.
      insn (vivisect...Instruction): the instruction to process.

    yields:
      (Feature, va): each pair produced by the handlers.
    '''
    for handler in INSTRUCTION_HANDLERS:
        for extracted, location in handler(f, bb, insn):
            yield extracted, location
# instruction-scope extractors that extract_features runs, in this order,
# against every instruction. each one is called as handler(f, bb, insn)
# and yields (feature, va) pairs.
INSTRUCTION_HANDLERS = (
extract_insn_api_features,
extract_insn_number_features,
extract_insn_string_features,
extract_insn_bytes_features,
extract_insn_offset_features,
extract_insn_nzxor_characteristic_features,
extract_insn_mnemonic_features,
extract_insn_peb_access_characteristic_features,
extract_insn_cross_section_cflow,
extract_insn_segment_access_features,
extract_function_calls_from,
extract_function_indirect_call_characteristic_features
)