Merge pull request #39 from fireeye/ana-description

This commit is contained in:
Ana María Martínez Gómez
2020-07-02 17:07:04 +02:00
committed by GitHub
21 changed files with 256 additions and 253 deletions

View File

@@ -84,6 +84,7 @@ Download capa from the [Releases](/releases) page or get the nightly builds here
- [section](#section)
- [counting](#counting)
- [matching prior rule matches](#matching-prior-rule-matches)
- [description](#description)
- [limitations](#Limitations)
# installation
@@ -339,8 +340,9 @@ For example, a crypto constant.
The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format.
To associate context with a number, e.g. for constant definitions, append an equal sign and the respective name to
the number definition. This helps with documenting rules and provides context in capa's output.
To help humans understand the meaning of a number, for example that the constant `0x40` means `PAGE_EXECUTE_READWRITE`, you may provide a description alongside the definition.
Use the inline syntax (preferred) by ending the line with ` = DESCRIPTION STRING`.
Check the [description section](#description) for more details.
Examples:
@@ -362,20 +364,31 @@ Regexes should be surrounded with `/` characters.
By default, capa uses case-sensitive matching and assumes leading and trailing wildcards.
To perform case-insensitive matching append an `i`. To anchor the regex at the start or end of a string, use `^` and/or `$`.
To add context to a string use the two-line syntax, using the `description` tag: `description: DESCRIPTION STRING`.
The inline syntax is not supported.
Check the [description section](#description) for more details.
Examples:
string: This program cannot be run in DOS mode.
string: Firefox 64.0
string: /SELECT.*FROM.*WHERE/
string: /Hardware\\Description\\System\\CentralProcessor/i
```
- string: This program cannot be run in DOS mode.
description: MS-DOS stub message
- string: '{3E5FC7F9-9A51-4367-9063-A120244FBEC7}'
description: CLSID_CMSTPLUA
- string: Firefox 64.0
- string: /SELECT.*FROM.*WHERE/
- string: /Hardware\\Description\\System\\CentralProcessor/i
```
Note that regex matching is expensive (`O(features)` rather than `O(1)`), so regexes should be used sparingly.
### bytes
A sequence of bytes referenced by the logic of the program.
The provided sequence must match from the beginning of the referenced bytes and be no more than `0x100` bytes.
The parameter is a sequence of hexadecimal bytes followed by an optional description.
The parameter is a sequence of hexadecimal bytes.
To help humans understand the meaning of the byte sequence, you may provide a description.
Use the inline syntax (preferred) by ending the line with ` = DESCRIPTION STRING`.
Check the [description section](#description) for more details.
The example below illustrates byte matching given a COM CLSID pushed onto the stack prior to `CoCreateInstance`.
@@ -397,6 +410,7 @@ A structure offset referenced by the logic of the program.
This should not be a stack offset.
The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format.
It can be followed by an optional description.
Examples:
@@ -453,6 +467,7 @@ These are the features supported at the file-scope:
- [import](#import)
- [section](#section)
### file string
An ASCII or UTF-16 LE string present in the file.
@@ -511,6 +526,10 @@ These rules can be expressed like:
count(mnemonic(mov)): 3
count(basic block): 4
`count` supports inline descriptions, except for [strings](#string), using the following syntax:
count(number(2 = AF_INET/SOCK_DGRAM)): 2
## matching prior rule matches
capa rules can specify logic for matching on other rule matches.
@@ -532,6 +551,28 @@ By default, library rules will not be output to the user as a rule match,
but can be matched by other rules.
When no active rules depend on a library rule, that library rule will not be evaluated, which maintains performance.
## description
All features support an optional description which helps with documenting rules and provides context in capa's output.
For all features except for [strings](#string), the description can be specified inline preceded by ` = `: ` = DESCRIPTION STRING`.
For example:
```
- number: 0x5A4D = IMAGE_DOS_SIGNATURE (MZ)
```
The inline syntax is preferred.
For [strings](#string) or if the description is long or contains newlines, use the two-line syntax.
It uses the `description` tag in the following way: `description: DESCRIPTION STRING`.
For example:
```
- string: This program cannot be run in DOS mode.
description: MS-DOS stub message
- number: 0x5A4D
description: IMAGE_DOS_SIGNATURE (MZ)
```
# limitations
To learn more about capa's current limitations see [here](doc/limitations.md).

View File

@@ -17,10 +17,11 @@ def bytes_to_str(b):
class Feature(object):
def __init__(self, args):
def __init__(self, args, description=None):
super(Feature, self).__init__()
self.name = self.__class__.__name__
self.name = self.__class__.__name__.lower()
self.args = args
self.description = description
def __hash__(self):
return hash((self.name, tuple(self.args)))
@@ -28,8 +29,16 @@ class Feature(object):
def __eq__(self, other):
return self.name == other.name and self.args == other.args
# Used to overwrite the rendering of the feature args in `__str__` and the
# json output
def get_args_str(self):
return ','.join(self.args)
def __str__(self):
return '%s(%s)' % (self.name.lower(), ','.join(self.args))
if self.description:
return '%s(%s = %s)' % (self.name, self.get_args_str(), self.description)
else:
return '%s(%s)' % (self.name, self.get_args_str())
def __repr__(self):
return str(self)
@@ -50,51 +59,41 @@ class Feature(object):
class MatchedRule(Feature):
def __init__(self, rule_name):
super(MatchedRule, self).__init__([rule_name])
def __init__(self, rule_name, description=None):
super(MatchedRule, self).__init__([rule_name], description)
self.name = 'match'
self.rule_name = rule_name
def __str__(self):
return 'match(%s)' % (self.rule_name)
class Characteristic(Feature):
def __init__(self, name, value=None):
'''
when `value` is not provided, this serves as descriptor for a class of characteristics.
this is only used internally, such as in `rules.py` when checking if a statement is
supported by a given scope.
'''
super(Characteristic, self).__init__([name, value])
self.name = name
def __init__(self, value, description=None):
super(Characteristic, self).__init__([value], description)
self.value = value
def evaluate(self, ctx):
if self.value is None:
raise ValueError('cannot evaluate characteristc %s with empty value' % (str(self)))
return super(Characteristic, self).evaluate(ctx)
def freeze_serialize(self):
# in an older version of capa, characteristics could theoretically match non-existence (value=False).
# but we found this was never used (and better expressed with `not: characteristic: ...`).
# this was represented using an additional parameter for Characteristic.
# its been removed, but we keep it around in the freeze format to maintain backwards compatibility.
# this value is ignored, however.
return (self.__class__.__name__, [self.value, True])
def __str__(self):
if self.value is None:
return 'characteristic(%s)' % (self.name)
else:
return 'characteristic(%s(%s))' % (self.name, self.value)
@classmethod
def freeze_deserialize(cls, args):
# see above. we ignore the second element in the 2-tuple here.
return cls(args[0])
class String(Feature):
def __init__(self, value):
super(String, self).__init__([value])
def __init__(self, value, description=None):
super(String, self).__init__([value], description)
self.value = value
def __str__(self):
return 'string("%s")' % (self.value)
class Bytes(Feature):
def __init__(self, value, symbol=None):
super(Bytes, self).__init__([value])
def __init__(self, value, description=None):
super(Bytes, self).__init__([value], description)
self.value = value
self.symbol = symbol
def evaluate(self, ctx):
for feature, locations in ctx.items():
@@ -106,11 +105,8 @@ class Bytes(Feature):
return capa.engine.Result(False, self, [])
def __str__(self):
if self.symbol:
return 'bytes(0x%s = %s)' % (bytes_to_str(self.value).upper(), self.symbol)
else:
return 'bytes(0x%s)' % (bytes_to_str(self.value).upper())
def get_args_str(self):
return bytes_to_str(self.value).upper()
def freeze_serialize(self):
return (self.__class__.__name__,

View File

@@ -184,22 +184,22 @@ class NullFeatureExtractor(FeatureExtractor):
extractor = NullFeatureExtractor({
'file features': [
(0x402345, capa.features.Characteristic('embedded pe', True)),
(0x402345, capa.features.Characteristic('embedded pe')),
],
'functions': {
0x401000: {
'features': [
(0x401000, capa.features.Characteristic('switch', True)),
(0x401000, capa.features.Characteristic('switch')),
],
'basic blocks': {
0x401000: {
'features': [
(0x401000, capa.features.Characteristic('tight-loop', True)),
(0x401000, capa.features.Characteristic('tight-loop')),
],
'instructions': {
0x401000: {
'features': [
(0x401000, capa.features.Characteristic('nzxor', True)),
(0x401000, capa.features.Characteristic('nzxor')),
],
},
0x401002: ...

View File

@@ -103,7 +103,7 @@ def extract_bb_stackstring(f, bb):
bb (IDA BasicBlock)
'''
if _ida_bb_contains_stackstring(f, bb):
yield Characteristic('stack string', True), bb.start_ea
yield Characteristic('stack string'), bb.start_ea
def _ida_bb_contains_tight_loop(f, bb):
@@ -133,7 +133,7 @@ def extract_bb_tight_loop(f, bb):
bb (IDA BasicBlock)
'''
if _ida_bb_contains_tight_loop(f, bb):
yield Characteristic('tight loop', True), bb.start_ea
yield Characteristic('tight loop'), bb.start_ea
def extract_features(f, bb):

View File

@@ -68,7 +68,7 @@ def extract_file_embedded_pe():
continue
for ea, _ in _ida_check_segment_for_pe(seg):
yield Characteristic('embedded pe', True), ea
yield Characteristic('embedded pe'), ea
def extract_file_export_names():

View File

@@ -29,7 +29,7 @@ def extract_function_switch(f):
f (IDA func_t)
'''
if _ida_function_contains_switch(f):
yield Characteristic('switch', True), f.start_ea
yield Characteristic('switch'), f.start_ea
def extract_function_calls_to(f):
@@ -39,7 +39,7 @@ def extract_function_calls_to(f):
f (IDA func_t)
'''
for ea in idautils.CodeRefsTo(f.start_ea, True):
yield Characteristic('calls to', True), ea
yield Characteristic('calls to'), ea
def extract_function_loop(f):
@@ -53,7 +53,7 @@ def extract_function_loop(f):
map(lambda s: edges.append((bb.start_ea, s.start_ea)), bb.succs())
if edges and loops.has_loop(edges):
yield Characteristic('loop', True), f.start_ea
yield Characteristic('loop'), f.start_ea
def extract_recursive_call(f):
@@ -64,7 +64,7 @@ def extract_recursive_call(f):
'''
for ref in idautils.CodeRefsTo(f.start_ea, True):
if f.contains(ref):
yield Characteristic('recursive call', True), f.start_ea
yield Characteristic('recursive call'), f.start_ea
break

View File

@@ -259,7 +259,7 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn):
if _is_nzxor_stack_cookie(f, bb, insn):
return
yield Characteristic('nzxor', True), insn.ea
yield Characteristic('nzxor'), insn.ea
def extract_insn_mnemonic_features(f, bb, insn):
@@ -292,7 +292,7 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn):
if ' fs:30h' in disasm or ' gs:60h' in disasm:
# TODO: replace above with proper IDA
yield Characteristic('peb access', True), insn.ea
yield Characteristic('peb access'), insn.ea
def extract_insn_segment_access_features(f, bb, insn):
@@ -309,11 +309,11 @@ def extract_insn_segment_access_features(f, bb, insn):
if ' fs:' in disasm:
# TODO: replace above with proper IDA
yield Characteristic('fs access', True), insn.ea
yield Characteristic('fs access'), insn.ea
if ' gs:' in disasm:
# TODO: replace above with proper IDA
yield Characteristic('gs access', True), insn.ea
yield Characteristic('gs access'), insn.ea
def extract_insn_cross_section_cflow(f, bb, insn):
@@ -336,7 +336,7 @@ def extract_insn_cross_section_cflow(f, bb, insn):
if idaapi.getseg(ref) == idaapi.getseg(insn.ea):
continue
yield Characteristic('cross section flow', True), insn.ea
yield Characteristic('cross section flow'), insn.ea
def extract_function_calls_from(f, bb, insn):
@@ -354,7 +354,7 @@ def extract_function_calls_from(f, bb, insn):
return
for ref in idautils.CodeRefsFrom(insn.ea, False):
yield Characteristic('calls from', True), ref
yield Characteristic('calls from'), ref
def extract_function_indirect_call_characteristic_features(f, bb, insn):
@@ -373,7 +373,7 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn):
return
if idc.get_operand_type(insn.ea, 0) in (idc.o_reg, idc.o_phrase, idc.o_displ):
yield Characteristic('indirect call', True), insn.ea
yield Characteristic('indirect call'), insn.ea
def extract_features(f, bb, insn):

View File

@@ -39,7 +39,7 @@ def _bb_has_tight_loop(f, bb):
def extract_bb_tight_loop(f, bb):
''' check basic block for tight loop indicators '''
if _bb_has_tight_loop(f, bb):
yield Characteristic('tight loop', True), bb.va
yield Characteristic('tight loop'), bb.va
def _bb_has_stackstring(f, bb):
@@ -62,7 +62,7 @@ def _bb_has_stackstring(f, bb):
def extract_stackstring(f, bb):
''' check basic block for stackstring indicators '''
if _bb_has_stackstring(f, bb):
yield Characteristic('stack string', True), bb.va
yield Characteristic('stack string'), bb.va
def is_mov_imm_to_stack(instr):

View File

@@ -13,7 +13,7 @@ def extract_file_embedded_pe(vw, file_path):
fbytes = f.read()
for offset, i in pe_carve.carve(fbytes, 1):
yield Characteristic('embedded pe', True), offset
yield Characteristic('embedded pe'), offset
def extract_file_export_names(vw, file_path):

View File

@@ -53,12 +53,12 @@ def extract_function_switch(f):
method can be optimized
'''
if f.va in get_functions_with_switch(f.vw):
yield Characteristic('switch', True), f.va
yield Characteristic('switch'), f.va
def extract_function_calls_to(f):
for src, _, _, _ in f.vw.getXrefsTo(f.va, rtype=vivisect.const.REF_CODE):
yield Characteristic('calls to', True), src
yield Characteristic('calls to'), src
def extract_function_loop(f):
@@ -74,7 +74,7 @@ def extract_function_loop(f):
edges.append((bb.va, bva))
if edges and loops.has_loop(edges):
yield Characteristic('loop', True), f.va
yield Characteristic('loop'), f.va
def extract_features(f):

View File

@@ -286,7 +286,7 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn):
if is_security_cookie(f, bb, insn):
return
yield Characteristic('nzxor', True), insn.va
yield Characteristic('nzxor'), insn.va
def extract_insn_mnemonic_features(f, bb, insn):
@@ -314,12 +314,12 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn):
# fs: push dword [eax + 0x30] ; i386RegMemOper, with eax = 0
if (isinstance(oper, envi.archs.i386.disasm.i386RegMemOper) and oper.disp == 0x30) or \
(isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper) and oper.imm == 0x30):
yield Characteristic('peb access', True), insn.va
yield Characteristic('peb access'), insn.va
elif 'gs' in insn.getPrefixName():
for oper in insn.opers:
if (isinstance(oper, envi.archs.amd64.disasm.i386RegMemOper) and oper.disp == 0x60) or \
(isinstance(oper, envi.archs.amd64.disasm.i386ImmMemOper) and oper.imm == 0x60):
yield Characteristic('peb access', True), insn.va
yield Characteristic('peb access'), insn.va
else:
pass
@@ -329,10 +329,10 @@ def extract_insn_segment_access_features(f, bb, insn):
prefix = insn.getPrefixName()
if prefix == 'fs':
yield Characteristic('fs access', True), insn.va
yield Characteristic('fs access'), insn.va
if prefix == 'gs':
yield Characteristic('gs access', True), insn.va
yield Characteristic('gs access'), insn.va
def get_section(vw, va):
@@ -369,7 +369,7 @@ def extract_insn_cross_section_cflow(f, bb, insn):
continue
if get_section(f.vw, insn.va) != get_section(f.vw, va):
yield Characteristic('cross section flow', True), insn.va
yield Characteristic('cross section flow'), insn.va
except KeyError:
continue
@@ -387,7 +387,7 @@ def extract_function_calls_from(f, bb, insn):
if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper):
oper = insn.opers[0]
target = oper.getOperAddr(insn)
yield Characteristic('calls from', True), target
yield Characteristic('calls from'), target
# call via thunk on x86,
# see 9324d1a8ae37a36ae560c37448c9705a at 0x407985
@@ -396,18 +396,18 @@ def extract_function_calls_from(f, bb, insn):
# see Lab21-01.exe_:0x140001178
elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386PcRelOper):
target = insn.opers[0].getOperValue(insn)
yield Characteristic('calls from', True), target
yield Characteristic('calls from'), target
# call via IAT, x64
elif isinstance(insn.opers[0], envi.archs.amd64.disasm.Amd64RipRelOper):
op = insn.opers[0]
target = op.getOperAddr(insn)
yield Characteristic('calls from', True), target
yield Characteristic('calls from'), target
if target and target == f.va:
# if we found a jump target and it's the function address
# mark as recursive
yield Characteristic('recursive call', True), target
yield Characteristic('recursive call'), target
# this is a feature that's most relevant at the function or basic block scope,
@@ -423,13 +423,13 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn):
# Checks below work for x86 and x64
if isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper):
# call edx
yield Characteristic('indirect call', True), insn.va
yield Characteristic('indirect call'), insn.va
elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegMemOper):
# call dword ptr [eax+50h]
yield Characteristic('indirect call', True), insn.va
yield Characteristic('indirect call'), insn.va
elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386SibOper):
# call qword ptr [rsp+78h]
yield Characteristic('indirect call', True), insn.va
yield Characteristic('indirect call'), insn.va
def extract_features(f, bb, insn):

View File

@@ -2,30 +2,21 @@ from capa.features import Feature
class Export(Feature):
def __init__(self, value):
def __init__(self, value, description=None):
# value is export name
super(Export, self).__init__([value])
super(Export, self).__init__([value], description)
self.value = value
def __str__(self):
return 'Export(%s)' % (self.value)
class Import(Feature):
def __init__(self, value):
def __init__(self, value, description=None):
# value is import name
super(Import, self).__init__([value])
super(Import, self).__init__([value], description)
self.value = value
def __str__(self):
return 'Import(%s)' % (self.value)
class Section(Feature):
def __init__(self, value):
def __init__(self, value, description=None):
# value is section name
super(Section, self).__init__([value])
super(Section, self).__init__([value], description)
self.value = value
def __str__(self):
return 'Section(%s)' % (self.value)

View File

@@ -2,45 +2,34 @@ from capa.features import Feature
class API(Feature):
def __init__(self, name):
def __init__(self, name, description=None):
# Downcase library name if given
if '.' in name:
modname, impname = name.split('.')
name = modname.lower() + '.' + impname
super(API, self).__init__([name])
super(API, self).__init__([name], description)
class Number(Feature):
def __init__(self, value, symbol=None):
super(Number, self).__init__([value])
def __init__(self, value, description=None):
super(Number, self).__init__([value], description)
self.value = value
self.symbol = symbol
def __str__(self):
if self.symbol:
return 'number(0x%x = %s)' % (self.value, self.symbol)
else:
return 'number(0x%x)' % (self.value)
def get_args_str(self):
return '0x%X' % self.value
class Offset(Feature):
    '''
    a structure offset referenced by the logic of the program.

    args:
      value: the numeric offset.
      description (str, optional): human-readable context for the offset,
        rendered by `Feature.__str__` and included in the result document.
    '''
    def __init__(self, value, description=None):
        # forward the description to the base class: `Feature.__init__` is what
        # stores it on `self.description`. omitting it here silently drops any
        # description attached to an offset in a rule.
        super(Offset, self).__init__([value], description)
        self.value = value

    def get_args_str(self):
        # offsets are rendered as upper-case hex (e.g. `0x1C`); this overrides the
        # default `','.join(self.args)` rendering, which requires string args.
        return '0x%X' % self.value
class Mnemonic(Feature):
def __init__(self, value):
super(Mnemonic, self).__init__([value])
def __init__(self, value, description=None):
super(Mnemonic, self).__init__([value], description)
self.value = value
def __str__(self):
return 'mnemonic(%s)' % (self.value)

View File

@@ -86,23 +86,11 @@ def convert_feature_to_result_document(feature):
"type": "characteristic"
},
"""
name, value = feature.freeze_serialize()
result = {'type': feature.name, feature.name: feature.get_args_str()}
if feature.description:
result['description'] = feature.description
# make the terms pretty
name = name.lower()
if name == 'matchedrule':
name = 'match'
# in the common case, there's a single argument
# so use it directly.
# like: name=number value=1
if isinstance(value, list) and len(value) == 1:
value = value[0]
return {
'type': name,
name: value,
}
return result
def convert_node_to_result_document(node):

View File

@@ -41,18 +41,19 @@ def render_statement(ostream, match, statement, indent=0):
# so, we have to inline some of the feature rendering here.
child = statement['child']
if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match'):
feature = '%s(%s)' % (child['type'], rutils.bold2(child[child['type']]))
if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match', 'characteristic'):
value = rutils.bold2(child[child['type']])
elif child['type'] in ('number', 'offset'):
feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex(child[child['type']])))
value = rutils.bold2(rutils.hex(child[child['type']]))
elif child['type'] == 'bytes':
feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex_string(child[child['type']])))
elif child['type'] == 'characteristic':
feature = 'characteristic(%s)' % (rutils.bold2(child['characteristic'][0]))
value = rutils.bold2(rutils.hex_string(child[child['type']]))
else:
raise RuntimeError('unexpected feature type: ' + str(child))
ostream.write('count(%s): ' % feature)
if child['description']:
ostream.write('count(%s(%s = %s)): ' % (child['type'], value, child['description']))
else:
ostream.write('count(%s(%s)): ' % (child['type'], value))
if statement['max'] == statement['min']:
ostream.write('%d' % (statement['min']))
@@ -80,7 +81,7 @@ def render_statement(ostream, match, statement, indent=0):
def render_feature(ostream, match, feature, indent=0):
ostream.write(' ' * indent)
if feature['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match'):
if feature['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match', 'characteristic'):
ostream.write(feature['type'])
ostream.write(': ')
ostream.write(rutils.bold2(feature[feature['type']]))
@@ -93,14 +94,15 @@ def render_feature(ostream, match, feature, indent=0):
# bytes is the uppercase, hex-encoded string.
# it should always be an even number of characters (its hex).
ostream.write(rutils.bold2(rutils.hex_string(feature[feature['type']])))
elif feature['type'] == 'characteristic':
ostream.write('characteristic(%s)' % (rutils.bold2(feature['characteristic'][0])))
# note that regex is found in `render_statement`
else:
raise RuntimeError('unexpected feature type: ' + str(feature))
render_locations(ostream, match)
if 'description' in feature:
ostream.write(' = ')
ostream.write(feature['description'])
render_locations(ostream, match)
ostream.write('\n')

View File

@@ -138,7 +138,7 @@ class InvalidRuleSet(ValueError):
def ensure_feature_valid_for_scope(scope, feature):
if isinstance(feature, capa.features.Characteristic):
if capa.features.Characteristic(feature.name) not in SUPPORTED_FEATURES[scope]:
if capa.features.Characteristic(feature.value) not in SUPPORTED_FEATURES[scope]:
raise InvalidRule('feature %s not support for scope %s' % (feature, scope))
elif not isinstance(feature, tuple(filter(lambda t: isinstance(t, type), SUPPORTED_FEATURES[scope]))):
raise InvalidRule('feature %s not support for scope %s' % (feature, scope))
@@ -205,9 +205,8 @@ def parse_feature(key):
return capa.features.insn.Mnemonic
elif key == 'basic blocks':
return capa.features.basicblock.BasicBlock
elif key.startswith('characteristic(') and key.endswith(')'):
characteristic = key[len('characteristic('):-len(')')]
return lambda v: capa.features.Characteristic(characteristic, v)
elif key == 'characteristic':
return capa.features.Characteristic
elif key == 'export':
return capa.features.file.Export
elif key == 'import':
@@ -220,18 +219,18 @@ def parse_feature(key):
raise InvalidRule('unexpected statement: %s' % key)
def parse_symbol(s, value_type):
def parse_description(s, value_type, description=None):
'''
s can be an int or a string
'''
if isinstance(s, str) and '=' in s:
value, symbol = s.split('=', 1)
symbol = symbol.strip()
if symbol == '':
raise InvalidRule('unexpected value: "%s", symbol name cannot be empty' % s)
if value_type != 'string' and isinstance(s, str) and ' = ' in s:
if description:
raise InvalidRule('unexpected value: "%s", only one description allowed (inline description with ` = `)' % s)
value, description = s.split(' = ', 1)
if description == '':
raise InvalidRule('unexpected value: "%s", description cannot be empty' % s)
else:
value = s
symbol = None
if isinstance(value, str):
if value_type == 'bytes':
@@ -244,17 +243,17 @@ def parse_symbol(s, value_type):
if len(value) > MAX_BYTES_FEATURE_SIZE:
raise InvalidRule('unexpected bytes value: byte sequences must be no larger than %s bytes' %
MAX_BYTES_FEATURE_SIZE)
else:
elif value_type in {'number', 'offset'}:
try:
value = parse_int(value)
except ValueError:
raise InvalidRule('unexpected value: "%s", must begin with numerical value' % value)
return value, symbol
return value, description
def build_statements(d, scope):
if len(d.keys()) != 1:
if len(d.keys()) > 2:
raise InvalidRule('too many statements')
key = list(d.keys())[0]
@@ -302,48 +301,33 @@ def build_statements(d, scope):
term = key[len('count('):-len(')')]
if term.startswith('characteristic('):
# characteristic features are specified a bit specially:
# they simply indicate the presence of something unusual/interesting,
# and we embed the name in the feature name, like `characteristic(nzxor)`.
#
# when we're dealing with counts, like `count(characteristic(nzxor))`,
# we can simply extract the feature and assume we're looking for `True` values.
Feature = parse_feature(term)
feature = Feature(True)
ensure_feature_valid_for_scope(scope, feature)
else:
# however, for remaining counted features, like `count(mnemonic(mov))`,
# we have to jump through hoops.
#
# when looking for the existance of such a feature, our rule might look like:
# - mnemonic: mov
#
# but here we deal with the form: `mnemonic(mov)`.
term, _, arg = term.partition('(')
Feature = parse_feature(term)
# when looking for the existence of such a feature, our rule might look like:
# - mnemonic: mov
#
# but here we deal with the form: `mnemonic(mov)`.
term, _, arg = term.partition('(')
Feature = parse_feature(term)
if arg:
arg = arg[:-len(')')]
# can't rely on yaml parsing ints embedded within strings
# like:
#
# count(offset(0xC))
# count(number(0x11223344))
# count(number(0x100 = symbol name))
if term in ('number', 'offset', 'bytes'):
value, symbol = parse_symbol(arg, term)
feature = Feature(value, symbol)
else:
# arg is string, like:
#
# count(mnemonic(mov))
# count(string(error))
# TODO: what about embedded newlines?
feature = Feature(arg)
if arg:
arg = arg[:-len(')')]
# can't rely on yaml parsing ints embedded within strings
# like:
#
# count(offset(0xC))
# count(number(0x11223344))
# count(number(0x100 = description))
if term != 'string':
value, description = parse_description(arg, term)
feature = Feature(value, description)
else:
feature = Feature()
ensure_feature_valid_for_scope(scope, feature)
# arg is string (which doesn't support inline descriptions), like:
#
# count(string(error))
# TODO: what about embedded newlines?
feature = Feature(arg)
else:
feature = Feature()
ensure_feature_valid_for_scope(scope, feature)
count = d[key]
if isinstance(count, int):
@@ -370,13 +354,8 @@ def build_statements(d, scope):
raise InvalidRule('invalid regular expression: %s it should use Python syntax, try it at https://pythex.org' % d[key])
else:
Feature = parse_feature(key)
if key in ('number', 'offset', 'bytes'):
# parse numbers with symbol description, e.g. 0x4550 = IMAGE_DOS_SIGNATURE
# or regular numbers, e.g. 37
value, symbol = parse_symbol(d[key], key)
feature = Feature(value, symbol)
else:
feature = Feature(d[key])
value, description = parse_description(d[key], key, d.get('description'))
feature = Feature(value, description)
ensure_feature_valid_for_scope(scope, feature)
return feature

2
rules

Submodule rules updated: e5db226844...da61c9138e

View File

@@ -12,23 +12,23 @@ from fixtures import *
EXTRACTOR = capa.features.extractors.NullFeatureExtractor({
'file features': [
(0x402345, capa.features.Characteristic('embedded pe', True)),
(0x402345, capa.features.Characteristic('embedded pe')),
],
'functions': {
0x401000: {
'features': [
(0x401000, capa.features.Characteristic('switch', True)),
(0x401000, capa.features.Characteristic('switch')),
],
'basic blocks': {
0x401000: {
'features': [
(0x401000, capa.features.Characteristic('tight loop', True)),
(0x401000, capa.features.Characteristic('tight loop')),
],
'instructions': {
0x401000: {
'features': [
(0x401000, capa.features.insn.Mnemonic('xor')),
(0x401000, capa.features.Characteristic('nzxor', True)),
(0x401000, capa.features.Characteristic('nzxor')),
],
},
0x401002: {
@@ -57,9 +57,9 @@ def test_null_feature_extractor():
scope: basic block
features:
- and:
- characteristic(tight loop): true
- characteristic: tight loop
- mnemonic: xor
- characteristic(nzxor): true
- characteristic: nzxor
''')),
])
capabilities = capa.main.find_capabilities(rules, EXTRACTOR)
@@ -150,7 +150,7 @@ def test_serialize_features():
roundtrip_feature(capa.features.insn.Offset(0x0))
roundtrip_feature(capa.features.insn.Mnemonic('push'))
roundtrip_feature(capa.features.file.Section('.rsrc'))
roundtrip_feature(capa.features.Characteristic('tight loop', True))
roundtrip_feature(capa.features.Characteristic('tight loop'))
roundtrip_feature(capa.features.basicblock.BasicBlock())
roundtrip_feature(capa.features.file.Export('BaseThreadInitThunk'))
roundtrip_feature(capa.features.file.Import('kernel32.IsWow64Process'))

View File

@@ -42,7 +42,7 @@ def test_ruleset():
name: file rule
scope: file
features:
- characteristic(embedded pe): y
- characteristic: embedded pe
''')),
capa.rules.Rule.from_yaml(textwrap.dedent('''
rule:
@@ -50,7 +50,7 @@ def test_ruleset():
name: function rule
scope: function
features:
- characteristic(switch): y
- characteristic: switch
''')),
capa.rules.Rule.from_yaml(textwrap.dedent('''
rule:
@@ -58,7 +58,7 @@ def test_ruleset():
name: basic block rule
scope: basic block
features:
- characteristic(nzxor): y
- characteristic: nzxor
''')),
])
@@ -128,7 +128,7 @@ def test_match_across_scopes(sample_9324d1a8ae37a36ae560c37448c9705a):
examples:
- 9324d1a8ae37a36ae560c37448c9705a:0x403685
features:
- characteristic(tight loop): true
- characteristic: tight loop
''')),
# this rule should match on a function (0x403660)
# based on API, as well as prior basic block rule match
@@ -176,7 +176,7 @@ def test_subscope_bb_rules(sample_9324d1a8ae37a36ae560c37448c9705a):
features:
- and:
- basic block:
- characteristic(tight loop): true
- characteristic: tight loop
'''))
])
# tight loop at 0x403685

View File

@@ -4,6 +4,7 @@ import pytest
import capa.rules
from capa.features.insn import Number, Offset
from capa.features import String
def test_rule_ctor():
@@ -56,6 +57,22 @@ def test_rule_yaml_complex():
assert r.evaluate({Number(6): {1}, Number(7): {1}, Number(8): {1}}) == False
def test_rule_yaml_descriptions():
rule = textwrap.dedent('''
rule:
meta:
name: test rule
features:
- and:
- number: 1 = This is the number 1
- string: This program cannot be run in DOS mode.
description: MS-DOS stub message
- count(number(2 = AF_INET/SOCK_DGRAM)): 2
''')
r = capa.rules.Rule.from_yaml(rule)
assert r.evaluate({Number(1): {1}, Number(2): {2, 3}, String('This program cannot be run in DOS mode.'): {4}}) == True
def test_rule_yaml_not():
rule = textwrap.dedent('''
rule:
@@ -118,7 +135,7 @@ def test_invalid_rule_feature():
name: test rule
scope: file
features:
- characteristic(nzxor): true
- characteristic: nzxor
'''))
with pytest.raises(capa.rules.InvalidRule):
@@ -128,7 +145,7 @@ def test_invalid_rule_feature():
name: test rule
scope: function
features:
- characteristic(embedded pe): true
- characteristic: embedded pe
'''))
with pytest.raises(capa.rules.InvalidRule):
@@ -138,7 +155,7 @@ def test_invalid_rule_feature():
name: test rule
scope: basic block
features:
- characteristic(embedded pe): true
- characteristic: embedded pe
'''))
@@ -173,11 +190,11 @@ def test_subscope_rules():
scope: file
features:
- and:
- characteristic(embedded pe): true
- characteristic: embedded pe
- function:
- and:
- characteristic(nzxor): true
- characteristic(switch): true
- characteristic: nzxor
- characteristic: switch
'''))
])
# the file rule scope will have one rule:
@@ -229,7 +246,7 @@ def test_invalid_rules():
meta:
name: test rule
features:
- characteristic(number(1)): True
- characteristic: number(1)
'''))
with pytest.raises(capa.rules.InvalidRule):
@@ -238,7 +255,7 @@ def test_invalid_rules():
meta:
name: test rule
features:
- characteristic(count(number(100))): True
- characteristic: count(number(100))
'''))

View File

@@ -116,7 +116,7 @@ def test_offset_features(mimikatz):
def test_nzxor_features(mimikatz):
features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x410DFC))
assert capa.features.Characteristic('nzxor', True) in features # 0x0410F0B
assert capa.features.Characteristic('nzxor') in features # 0x0410F0B
def get_bb_insn(f, va):
@@ -154,7 +154,7 @@ def test_mnemonic_features(mimikatz):
def test_peb_access_features(sample_a933a1a402775cfa94b6bee0963f4b46):
features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA6FEC))
assert capa.features.Characteristic('peb access', True) in features
assert capa.features.Characteristic('peb access') in features
def test_file_section_name_features(mimikatz):
@@ -170,7 +170,7 @@ def test_tight_loop_features(mimikatz):
if bb.va != 0x402F8E:
continue
features = extract_basic_block_features(f, bb)
assert capa.features.Characteristic('tight loop', True) in features
assert capa.features.Characteristic('tight loop') in features
assert capa.features.basicblock.BasicBlock() in features
@@ -180,7 +180,7 @@ def test_tight_loop_bb_features(mimikatz):
if bb.va != 0x402F8E:
continue
features = extract_basic_block_features(f, bb)
assert capa.features.Characteristic('tight loop', True) in features
assert capa.features.Characteristic('tight loop') in features
assert capa.features.basicblock.BasicBlock() in features
@@ -202,17 +202,17 @@ def test_file_import_name_features(mimikatz):
def test_cross_section_flow_features(sample_a198216798ca38f280dc413f8c57f2c2):
features = extract_function_features(viv_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.vw, 0x4014D0))
assert capa.features.Characteristic('cross section flow', True) in features
assert capa.features.Characteristic('cross section flow') in features
# this function has calls to some imports,
# which should not trigger cross-section flow characteristic
features = extract_function_features(viv_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.vw, 0x401563))
assert capa.features.Characteristic('cross section flow', True) not in features
assert capa.features.Characteristic('cross section flow') not in features
def test_segment_access_features(sample_a933a1a402775cfa94b6bee0963f4b46):
features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA6FEC))
assert capa.features.Characteristic('fs access', True) in features
assert capa.features.Characteristic('fs access') in features
def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a):
@@ -223,36 +223,36 @@ def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a):
def test_file_embedded_pe(pma_lab_12_04):
features = extract_file_features(pma_lab_12_04.vw, pma_lab_12_04.path)
assert capa.features.Characteristic('embedded pe', True) in features
assert capa.features.Characteristic('embedded pe') in features
def test_stackstring_features(mimikatz):
features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x4556E5))
assert capa.features.Characteristic('stack string', True) in features
assert capa.features.Characteristic('stack string') in features
def test_switch_features(mimikatz):
features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x409411))
assert capa.features.Characteristic('switch', True) in features
assert capa.features.Characteristic('switch') in features
features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x409393))
assert capa.features.Characteristic('switch', True) not in features
assert capa.features.Characteristic('switch') not in features
def test_recursive_call_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41):
features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10003100))
assert capa.features.Characteristic('recursive call', True) in features
assert capa.features.Characteristic('recursive call') in features
features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10007B00))
assert capa.features.Characteristic('recursive call', True) not in features
assert capa.features.Characteristic('recursive call') not in features
def test_loop_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41):
features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10003D30))
assert capa.features.Characteristic('loop', True) in features
assert capa.features.Characteristic('loop') in features
features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10007250))
assert capa.features.Characteristic('loop', True) not in features
assert capa.features.Characteristic('loop') not in features
def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5):
@@ -263,20 +263,20 @@ def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5):
def test_function_calls_to(sample_9324d1a8ae37a36ae560c37448c9705a):
features = extract_function_features(viv_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.vw, 0x406F60))
assert capa.features.Characteristic('calls to', True) in features
assert len(features[capa.features.Characteristic('calls to', True)]) == 1
assert capa.features.Characteristic('calls to') in features
assert len(features[capa.features.Characteristic('calls to')]) == 1
def test_function_calls_to64(sample_lab21_01):
features = extract_function_features(viv_utils.Function(sample_lab21_01.vw, 0x1400052D0)) # memcpy
assert capa.features.Characteristic('calls to', True) in features
assert len(features[capa.features.Characteristic('calls to', True)]) == 8
assert capa.features.Characteristic('calls to') in features
assert len(features[capa.features.Characteristic('calls to')]) == 8
def test_function_calls_from(sample_9324d1a8ae37a36ae560c37448c9705a):
features = extract_function_features(viv_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.vw, 0x406F60))
assert capa.features.Characteristic('calls from', True) in features
assert len(features[capa.features.Characteristic('calls from', True)]) == 23
assert capa.features.Characteristic('calls from') in features
assert len(features[capa.features.Characteristic('calls from')]) == 23
def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a):
@@ -286,8 +286,8 @@ def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a):
def test_indirect_call_features(sample_a933a1a402775cfa94b6bee0963f4b46):
features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA68A0))
assert capa.features.Characteristic('indirect call', True) in features
assert len(features[capa.features.Characteristic('indirect call', True)]) == 3
assert capa.features.Characteristic('indirect call') in features
assert len(features[capa.features.Characteristic('indirect call')]) == 3
def test_indirect_calls_resolved(sample_c91887d861d9bd4a5872249b641bc9f9):