diff --git a/README.md b/README.md index b8647373..bbc8fc0d 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ Download capa from the [Releases](/releases) page or get the nightly builds here - [section](#section) - [counting](#counting) - [matching prior rule matches](#matching-prior-rule-matches) + - [description](#description) - [limitations](#Limitations) # installation @@ -339,8 +340,9 @@ For example, a crypto constant. The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format. -To associate context with a number, e.g. for constant definitions, append an equal sign and the respective name to -the number definition. This helps with documenting rules and provides context in capa's output. +To help humans understand the meaning of a number, e.g. that the constant `0x40` means `PAGE_EXECUTE_READWRITE`, you may provide a description alongside the definition. +Use the inline syntax (preferred) by ending the line with ` = DESCRIPTION STRING`. +Check the [description section](#description) for more details. Examples: @@ -362,20 +364,31 @@ Regexes should be surrounded with `/` characters. By default, capa uses case-sensitive matching and assumes leading and trailing wildcards. To perform case-insensitive matching append an `i`. To anchor the regex at the start or end of a string, use `^` and/or `$`. +To add context to a string use the two-line syntax, using the `description` tag: `description: DESCRIPTION STRING`. +The inline syntax is not supported. +Check the [description section](#description) for more details. + Examples: - string: This program cannot be run in DOS mode. - string: Firefox 64.0 - string: /SELECT.*FROM.*WHERE/ - string: /Hardware\\Description\\System\\CentralProcessor/i - +``` +- string: This program cannot be run in DOS mode.
+ description: MS-DOS stub message +- string: '{3E5FC7F9-9A51-4367-9063-A120244FBEC7}' + description: CLSID_CMSTPLUA +- string: Firefox 64.0 +- string: /SELECT.*FROM.*WHERE/ +- string: /Hardware\\Description\\System\\CentralProcessor/i +``` + Note that regex matching is expensive (`O(features)` rather than `O(1)`) so they should be used sparingly. ### bytes A sequence of bytes referenced by the logic of the program. The provided sequence must match from the beginning of the referenced bytes and be no more than `0x100` bytes. -The parameter is a sequence of hexadecimal bytes followed by an optional description. - +The parameter is a sequence of hexadecimal bytes. +To help humans understand the meaning of the byte sequence, you may provide a description. +Use the inline syntax (preferred) by ending the line with ` = DESCRIPTION STRING`. +Check the [description section](#description) for more details. The example below illustrates byte matching given a COM CLSID pushed onto the stack prior to `CoCreateInstance`. @@ -397,6 +410,7 @@ A structure offset referenced by the logic of the program. This should not be a stack offset. The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format. +It can be followed by an optional description. Examples: @@ -453,6 +467,7 @@ These are the features supported at the file-scope: - [import](#import) - [section](#section) + ### file string An ASCII or UTF-16 LE string present in the file. @@ -511,6 +526,10 @@ These rules can be expressed like: count(mnemonic(mov)): 3 count(basic block): 4 +`count` supports inline descriptions, except for [strings](#string), using the following syntax: + + count(number(2 = AF_INET/SOCK_DGRAM)): 2 + ## matching prior rule matches capa rules can specify logic for matching on other rule matches. @@ -532,6 +551,28 @@ By default, library rules will not be output to the user as a rule match, but can be matched by other rules.
When no active rules depend on a library rule, these the library rules will not be evaluated - maintaining performance. +## description + +All features support an optional description which helps with documenting rules and provides context in capa's output. +For all features except for [strings](#string), the description can be specified inline preceded by ` = `: ` = DESCRIPTION STRING`. +For example: + +``` +- number: 0x4550 = IMAGE_DOS_SIGNATURE (MZ) +``` + +The inline syntax is preferred. +For [strings](#string) or if the description is long or contains newlines, use the two-line syntax. +It uses the `description` tag in the following way: `description: DESCRIPTION STRING` +For example: + +``` +- string: This program cannot be run in DOS mode. + description: MS-DOS stub message +- number: 0x4550 + description: IMAGE_DOS_SIGNATURE (MZ) +``` + # limitations To learn more about capa's current limitations see [here](doc/limitations.md). diff --git a/capa/features/__init__.py b/capa/features/__init__.py index 9fec2d76..28931ad9 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -17,10 +17,11 @@ def bytes_to_str(b): class Feature(object): - def __init__(self, args): + def __init__(self, args, description=None): super(Feature, self).__init__() - self.name = self.__class__.__name__ + self.name = self.__class__.__name__.lower() self.args = args + self.description = description def __hash__(self): return hash((self.name, tuple(self.args))) @@ -28,8 +29,16 @@ class Feature(object): def __eq__(self, other): return self.name == other.name and self.args == other.args + # Used to overwrite the rendering of the feature args in `__str__` and the + # json output + def get_args_str(self): + return ','.join(self.args) + def __str__(self): - return '%s(%s)' % (self.name.lower(), ','.join(self.args)) + if self.description: + return '%s(%s = %s)' % (self.name, self.get_args_str(), self.description) + else: + return '%s(%s)' % (self.name, self.get_args_str()) def 
__repr__(self): return str(self) @@ -50,51 +59,41 @@ class Feature(object): class MatchedRule(Feature): - def __init__(self, rule_name): - super(MatchedRule, self).__init__([rule_name]) + def __init__(self, rule_name, description=None): + super(MatchedRule, self).__init__([rule_name], description) + self.name = 'match' self.rule_name = rule_name - def __str__(self): - return 'match(%s)' % (self.rule_name) - class Characteristic(Feature): - def __init__(self, name, value=None): - ''' - when `value` is not provided, this serves as descriptor for a class of characteristics. - this is only used internally, such as in `rules.py` when checking if a statement is - supported by a given scope. - ''' - super(Characteristic, self).__init__([name, value]) - self.name = name + def __init__(self, value, description=None): + super(Characteristic, self).__init__([value], description) self.value = value - def evaluate(self, ctx): - if self.value is None: - raise ValueError('cannot evaluate characteristc %s with empty value' % (str(self))) - return super(Characteristic, self).evaluate(ctx) + def freeze_serialize(self): + # in an older version of capa, characteristics could theoretically match non-existence (value=False). + # but we found this was never used (and better expressed with `not: characteristic: ...`). + # this was represented using an additional parameter for Characteristic. + # its been removed, but we keep it around in the freeze format to maintain backwards compatibility. + # this value is ignored, however. + return (self.__class__.__name__, [self.value, True]) - def __str__(self): - if self.value is None: - return 'characteristic(%s)' % (self.name) - else: - return 'characteristic(%s(%s))' % (self.name, self.value) + @classmethod + def freeze_deserialize(cls, args): + # see above. we ignore the second element in the 2-tuple here. 
+ return cls(args[0]) class String(Feature): - def __init__(self, value): - super(String, self).__init__([value]) + def __init__(self, value, description=None): + super(String, self).__init__([value], description) self.value = value - def __str__(self): - return 'string("%s")' % (self.value) - class Bytes(Feature): - def __init__(self, value, symbol=None): - super(Bytes, self).__init__([value]) + def __init__(self, value, description=None): + super(Bytes, self).__init__([value], description) self.value = value - self.symbol = symbol def evaluate(self, ctx): for feature, locations in ctx.items(): @@ -106,11 +105,8 @@ class Bytes(Feature): return capa.engine.Result(False, self, []) - def __str__(self): - if self.symbol: - return 'bytes(0x%s = %s)' % (bytes_to_str(self.value).upper(), self.symbol) - else: - return 'bytes(0x%s)' % (bytes_to_str(self.value).upper()) + def get_args_str(self): + return bytes_to_str(self.value).upper() def freeze_serialize(self): return (self.__class__.__name__, diff --git a/capa/features/extractors/__init__.py b/capa/features/extractors/__init__.py index 0486a63a..f8cfa941 100644 --- a/capa/features/extractors/__init__.py +++ b/capa/features/extractors/__init__.py @@ -184,22 +184,22 @@ class NullFeatureExtractor(FeatureExtractor): extractor = NullFeatureExtractor({ 'file features': [ - (0x402345, capa.features.Characteristic('embedded pe', True)), + (0x402345, capa.features.Characteristic('embedded pe')), ], 'functions': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('switch', True)), + (0x401000, capa.features.Characteristic('switch')), ], 'basic blocks': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('tight-loop', True)), + (0x401000, capa.features.Characteristic('tight-loop')), ], 'instructions': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('nzxor', True)), + (0x401000, capa.features.Characteristic('nzxor')), ], }, 0x401002: ... 
diff --git a/capa/features/extractors/ida/basicblock.py b/capa/features/extractors/ida/basicblock.py index 51ba648a..e4756390 100644 --- a/capa/features/extractors/ida/basicblock.py +++ b/capa/features/extractors/ida/basicblock.py @@ -103,7 +103,7 @@ def extract_bb_stackstring(f, bb): bb (IDA BasicBlock) ''' if _ida_bb_contains_stackstring(f, bb): - yield Characteristic('stack string', True), bb.start_ea + yield Characteristic('stack string'), bb.start_ea def _ida_bb_contains_tight_loop(f, bb): @@ -133,7 +133,7 @@ def extract_bb_tight_loop(f, bb): bb (IDA BasicBlock) ''' if _ida_bb_contains_tight_loop(f, bb): - yield Characteristic('tight loop', True), bb.start_ea + yield Characteristic('tight loop'), bb.start_ea def extract_features(f, bb): diff --git a/capa/features/extractors/ida/file.py b/capa/features/extractors/ida/file.py index f75bf148..4b00a84a 100644 --- a/capa/features/extractors/ida/file.py +++ b/capa/features/extractors/ida/file.py @@ -68,7 +68,7 @@ def extract_file_embedded_pe(): continue for ea, _ in _ida_check_segment_for_pe(seg): - yield Characteristic('embedded pe', True), ea + yield Characteristic('embedded pe'), ea def extract_file_export_names(): diff --git a/capa/features/extractors/ida/function.py b/capa/features/extractors/ida/function.py index 0712ec8b..c6f55d36 100644 --- a/capa/features/extractors/ida/function.py +++ b/capa/features/extractors/ida/function.py @@ -29,7 +29,7 @@ def extract_function_switch(f): f (IDA func_t) ''' if _ida_function_contains_switch(f): - yield Characteristic('switch', True), f.start_ea + yield Characteristic('switch'), f.start_ea def extract_function_calls_to(f): @@ -39,7 +39,7 @@ def extract_function_calls_to(f): f (IDA func_t) ''' for ea in idautils.CodeRefsTo(f.start_ea, True): - yield Characteristic('calls to', True), ea + yield Characteristic('calls to'), ea def extract_function_loop(f): @@ -53,7 +53,7 @@ def extract_function_loop(f): map(lambda s: edges.append((bb.start_ea, s.start_ea)), bb.succs()) if 
edges and loops.has_loop(edges): - yield Characteristic('loop', True), f.start_ea + yield Characteristic('loop'), f.start_ea def extract_recursive_call(f): @@ -64,7 +64,7 @@ def extract_recursive_call(f): ''' for ref in idautils.CodeRefsTo(f.start_ea, True): if f.contains(ref): - yield Characteristic('recursive call', True), f.start_ea + yield Characteristic('recursive call'), f.start_ea break diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index 3526d67d..59c7eeb3 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -259,7 +259,7 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn): if _is_nzxor_stack_cookie(f, bb, insn): return - yield Characteristic('nzxor', True), insn.ea + yield Characteristic('nzxor'), insn.ea def extract_insn_mnemonic_features(f, bb, insn): @@ -292,7 +292,7 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn): if ' fs:30h' in disasm or ' gs:60h' in disasm: # TODO: replace above with proper IDA - yield Characteristic('peb access', True), insn.ea + yield Characteristic('peb access'), insn.ea def extract_insn_segment_access_features(f, bb, insn): @@ -309,11 +309,11 @@ def extract_insn_segment_access_features(f, bb, insn): if ' fs:' in disasm: # TODO: replace above with proper IDA - yield Characteristic('fs access', True), insn.ea + yield Characteristic('fs access'), insn.ea if ' gs:' in disasm: # TODO: replace above with proper IDA - yield Characteristic('gs access', True), insn.ea + yield Characteristic('gs access'), insn.ea def extract_insn_cross_section_cflow(f, bb, insn): @@ -336,7 +336,7 @@ def extract_insn_cross_section_cflow(f, bb, insn): if idaapi.getseg(ref) == idaapi.getseg(insn.ea): continue - yield Characteristic('cross section flow', True), insn.ea + yield Characteristic('cross section flow'), insn.ea def extract_function_calls_from(f, bb, insn): @@ -354,7 +354,7 @@ def extract_function_calls_from(f, bb, insn): return for 
ref in idautils.CodeRefsFrom(insn.ea, False): - yield Characteristic('calls from', True), ref + yield Characteristic('calls from'), ref def extract_function_indirect_call_characteristic_features(f, bb, insn): @@ -373,7 +373,7 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn): return if idc.get_operand_type(insn.ea, 0) in (idc.o_reg, idc.o_phrase, idc.o_displ): - yield Characteristic('indirect call', True), insn.ea + yield Characteristic('indirect call'), insn.ea def extract_features(f, bb, insn): diff --git a/capa/features/extractors/viv/basicblock.py b/capa/features/extractors/viv/basicblock.py index a7a6ef5c..ad1be20e 100644 --- a/capa/features/extractors/viv/basicblock.py +++ b/capa/features/extractors/viv/basicblock.py @@ -39,7 +39,7 @@ def _bb_has_tight_loop(f, bb): def extract_bb_tight_loop(f, bb): ''' check basic block for tight loop indicators ''' if _bb_has_tight_loop(f, bb): - yield Characteristic('tight loop', True), bb.va + yield Characteristic('tight loop'), bb.va def _bb_has_stackstring(f, bb): @@ -62,7 +62,7 @@ def _bb_has_stackstring(f, bb): def extract_stackstring(f, bb): ''' check basic block for stackstring indicators ''' if _bb_has_stackstring(f, bb): - yield Characteristic('stack string', True), bb.va + yield Characteristic('stack string'), bb.va def is_mov_imm_to_stack(instr): diff --git a/capa/features/extractors/viv/file.py b/capa/features/extractors/viv/file.py index 78678c77..4f35c601 100644 --- a/capa/features/extractors/viv/file.py +++ b/capa/features/extractors/viv/file.py @@ -13,7 +13,7 @@ def extract_file_embedded_pe(vw, file_path): fbytes = f.read() for offset, i in pe_carve.carve(fbytes, 1): - yield Characteristic('embedded pe', True), offset + yield Characteristic('embedded pe'), offset def extract_file_export_names(vw, file_path): diff --git a/capa/features/extractors/viv/function.py b/capa/features/extractors/viv/function.py index 43271c13..8efffac3 100644 --- a/capa/features/extractors/viv/function.py 
+++ b/capa/features/extractors/viv/function.py @@ -53,12 +53,12 @@ def extract_function_switch(f): method can be optimized ''' if f.va in get_functions_with_switch(f.vw): - yield Characteristic('switch', True), f.va + yield Characteristic('switch'), f.va def extract_function_calls_to(f): for src, _, _, _ in f.vw.getXrefsTo(f.va, rtype=vivisect.const.REF_CODE): - yield Characteristic('calls to', True), src + yield Characteristic('calls to'), src def extract_function_loop(f): @@ -74,7 +74,7 @@ def extract_function_loop(f): edges.append((bb.va, bva)) if edges and loops.has_loop(edges): - yield Characteristic('loop', True), f.va + yield Characteristic('loop'), f.va def extract_features(f): diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index 08cc431a..1b4c68c2 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -286,7 +286,7 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn): if is_security_cookie(f, bb, insn): return - yield Characteristic('nzxor', True), insn.va + yield Characteristic('nzxor'), insn.va def extract_insn_mnemonic_features(f, bb, insn): @@ -314,12 +314,12 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn): # fs: push dword [eax + 0x30] ; i386RegMemOper, with eax = 0 if (isinstance(oper, envi.archs.i386.disasm.i386RegMemOper) and oper.disp == 0x30) or \ (isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper) and oper.imm == 0x30): - yield Characteristic('peb access', True), insn.va + yield Characteristic('peb access'), insn.va elif 'gs' in insn.getPrefixName(): for oper in insn.opers: if (isinstance(oper, envi.archs.amd64.disasm.i386RegMemOper) and oper.disp == 0x60) or \ (isinstance(oper, envi.archs.amd64.disasm.i386ImmMemOper) and oper.imm == 0x60): - yield Characteristic('peb access', True), insn.va + yield Characteristic('peb access'), insn.va else: pass @@ -329,10 +329,10 @@ def extract_insn_segment_access_features(f, bb, insn): prefix 
= insn.getPrefixName() if prefix == 'fs': - yield Characteristic('fs access', True), insn.va + yield Characteristic('fs access'), insn.va if prefix == 'gs': - yield Characteristic('gs access', True), insn.va + yield Characteristic('gs access'), insn.va def get_section(vw, va): @@ -369,7 +369,7 @@ def extract_insn_cross_section_cflow(f, bb, insn): continue if get_section(f.vw, insn.va) != get_section(f.vw, va): - yield Characteristic('cross section flow', True), insn.va + yield Characteristic('cross section flow'), insn.va except KeyError: continue @@ -387,7 +387,7 @@ def extract_function_calls_from(f, bb, insn): if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper): oper = insn.opers[0] target = oper.getOperAddr(insn) - yield Characteristic('calls from', True), target + yield Characteristic('calls from'), target # call via thunk on x86, # see 9324d1a8ae37a36ae560c37448c9705a at 0x407985 @@ -396,18 +396,18 @@ def extract_function_calls_from(f, bb, insn): # see Lab21-01.exe_:0x140001178 elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386PcRelOper): target = insn.opers[0].getOperValue(insn) - yield Characteristic('calls from', True), target + yield Characteristic('calls from'), target # call via IAT, x64 elif isinstance(insn.opers[0], envi.archs.amd64.disasm.Amd64RipRelOper): op = insn.opers[0] target = op.getOperAddr(insn) - yield Characteristic('calls from', True), target + yield Characteristic('calls from'), target if target and target == f.va: # if we found a jump target and it's the function address # mark as recursive - yield Characteristic('recursive call', True), target + yield Characteristic('recursive call'), target # this is a feature that's most relevant at the function or basic block scope, @@ -423,13 +423,13 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn): # Checks below work for x86 and x64 if isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper): # call edx - yield Characteristic('indirect call', 
True), insn.va + yield Characteristic('indirect call'), insn.va elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegMemOper): # call dword ptr [eax+50h] - yield Characteristic('indirect call', True), insn.va + yield Characteristic('indirect call'), insn.va elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386SibOper): # call qword ptr [rsp+78h] - yield Characteristic('indirect call', True), insn.va + yield Characteristic('indirect call'), insn.va def extract_features(f, bb, insn): diff --git a/capa/features/file.py b/capa/features/file.py index 708b8e2b..396edd1f 100644 --- a/capa/features/file.py +++ b/capa/features/file.py @@ -2,30 +2,21 @@ from capa.features import Feature class Export(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is export name - super(Export, self).__init__([value]) + super(Export, self).__init__([value], description) self.value = value - def __str__(self): - return 'Export(%s)' % (self.value) - class Import(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is import name - super(Import, self).__init__([value]) + super(Import, self).__init__([value], description) self.value = value - def __str__(self): - return 'Import(%s)' % (self.value) - class Section(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is section name - super(Section, self).__init__([value]) + super(Section, self).__init__([value], description) self.value = value - - def __str__(self): - return 'Section(%s)' % (self.value) diff --git a/capa/features/insn.py b/capa/features/insn.py index b8ebf9da..a353cb43 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -2,45 +2,34 @@ from capa.features import Feature class API(Feature): - def __init__(self, name): + def __init__(self, name, description=None): # Downcase library name if given if '.' in name: modname, impname = name.split('.') name = modname.lower() + '.' 
+ impname - super(API, self).__init__([name]) + super(API, self).__init__([name], description) class Number(Feature): - def __init__(self, value, symbol=None): - super(Number, self).__init__([value]) + def __init__(self, value, description=None): + super(Number, self).__init__([value], description) self.value = value - self.symbol = symbol - def __str__(self): - if self.symbol: - return 'number(0x%x = %s)' % (self.value, self.symbol) - else: - return 'number(0x%x)' % (self.value) + def get_args_str(self): + return '0x%X' % self.value class Offset(Feature): - def __init__(self, value, symbol=None): + def __init__(self, value, description=None): - super(Offset, self).__init__([value]) + super(Offset, self).__init__([value], description) self.value = value - self.symbol = symbol - def __str__(self): - if self.symbol: - return 'offset(0x%x = %s)' % (self.value, self.symbol) - else: - return 'offset(0x%x)' % (self.value) + def get_args_str(self): + return '0x%X' % self.value class Mnemonic(Feature): - def __init__(self, value): - super(Mnemonic, self).__init__([value]) + def __init__(self, value, description=None): + super(Mnemonic, self).__init__([value], description) self.value = value - - def __str__(self): - return 'mnemonic(%s)' % (self.value) diff --git a/capa/render/__init__.py b/capa/render/__init__.py index e53aca1f..a1808f6e 100644 --- a/capa/render/__init__.py +++ b/capa/render/__init__.py @@ -86,23 +86,11 @@ def convert_feature_to_result_document(feature): "type": "characteristic" }, """ - name, value = feature.freeze_serialize() + result = {'type': feature.name, feature.name: feature.get_args_str()} + if feature.description: + result['description'] = feature.description - # make the terms pretty - name = name.lower() - if name == 'matchedrule': - name = 'match' - - # in the common case, there's a single argument - # so use it directly.
- # like: name=number value=1 - if isinstance(value, list) and len(value) == 1: - value = value[0] - - return { - 'type': name, - name: value, - } + return result def convert_node_to_result_document(node): diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 46dd09f8..2e5a7221 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -41,18 +41,19 @@ def render_statement(ostream, match, statement, indent=0): # so, we have to inline some of the feature rendering here. child = statement['child'] - if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match'): - feature = '%s(%s)' % (child['type'], rutils.bold2(child[child['type']])) + if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match', 'characteristic'): + value = rutils.bold2(child[child['type']]) elif child['type'] in ('number', 'offset'): - feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex(child[child['type']]))) + value = rutils.bold2(rutils.hex(child[child['type']])) elif child['type'] == 'bytes': - feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex_string(child[child['type']]))) - elif child['type'] == 'characteristic': - feature = 'characteristic(%s)' % (rutils.bold2(child['characteristic'][0])) + value = rutils.bold2(rutils.hex_string(child[child['type']])) else: raise RuntimeError('unexpected feature type: ' + str(child)) - ostream.write('count(%s): ' % feature) + if child.get('description'): + ostream.write('count(%s(%s = %s)): ' % (child['type'], value, child['description'])) + else: + ostream.write('count(%s(%s)): ' % (child['type'], value)) if statement['max'] == statement['min']: ostream.write('%d' % (statement['min'])) @@ -80,7 +81,7 @@ def render_statement(ostream, match, statement, indent=0): def render_feature(ostream, match, feature, indent=0): ostream.write(' ' * indent) - if feature['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import',
'section', 'match'): + if feature['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match', 'characteristic'): ostream.write(feature['type']) ostream.write(': ') ostream.write(rutils.bold2(feature[feature['type']])) @@ -93,14 +94,15 @@ def render_feature(ostream, match, feature, indent=0): # bytes is the uppercase, hex-encoded string. # it should always be an even number of characters (its hex). ostream.write(rutils.bold2(rutils.hex_string(feature[feature['type']]))) - elif feature['type'] == 'characteristic': - ostream.write('characteristic(%s)' % (rutils.bold2(feature['characteristic'][0]))) # note that regex is found in `render_statement` else: raise RuntimeError('unexpected feature type: ' + str(feature)) - render_locations(ostream, match) + if 'description' in feature: + ostream.write(' = ') + ostream.write(feature['description']) + render_locations(ostream, match) ostream.write('\n') diff --git a/capa/rules.py b/capa/rules.py index 8e42d0fc..7bcc6695 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -138,7 +138,7 @@ class InvalidRuleSet(ValueError): def ensure_feature_valid_for_scope(scope, feature): if isinstance(feature, capa.features.Characteristic): - if capa.features.Characteristic(feature.name) not in SUPPORTED_FEATURES[scope]: + if capa.features.Characteristic(feature.value) not in SUPPORTED_FEATURES[scope]: raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) elif not isinstance(feature, tuple(filter(lambda t: isinstance(t, type), SUPPORTED_FEATURES[scope]))): raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) @@ -205,9 +205,8 @@ def parse_feature(key): return capa.features.insn.Mnemonic elif key == 'basic blocks': return capa.features.basicblock.BasicBlock - elif key.startswith('characteristic(') and key.endswith(')'): - characteristic = key[len('characteristic('):-len(')')] - return lambda v: capa.features.Characteristic(characteristic, v) + elif key == 
'characteristic': + return capa.features.Characteristic elif key == 'export': return capa.features.file.Export elif key == 'import': @@ -220,18 +219,18 @@ def parse_feature(key): raise InvalidRule('unexpected statement: %s' % key) -def parse_symbol(s, value_type): +def parse_description(s, value_type, description=None): ''' s can be an int or a string ''' - if isinstance(s, str) and '=' in s: - value, symbol = s.split('=', 1) - symbol = symbol.strip() - if symbol == '': - raise InvalidRule('unexpected value: "%s", symbol name cannot be empty' % s) + if value_type != 'string' and isinstance(s, str) and ' = ' in s: + if description: + raise InvalidRule('unexpected value: "%s", only one description allowed (inline description with ` = `)' % s) + value, description = s.split(' = ', 1) + if description == '': + raise InvalidRule('unexpected value: "%s", description cannot be empty' % s) else: value = s - symbol = None if isinstance(value, str): if value_type == 'bytes': @@ -244,17 +243,17 @@ def parse_symbol(s, value_type): if len(value) > MAX_BYTES_FEATURE_SIZE: raise InvalidRule('unexpected bytes value: byte sequences must be no larger than %s bytes' % MAX_BYTES_FEATURE_SIZE) - else: + elif value_type in {'number', 'offset'}: try: value = parse_int(value) except ValueError: raise InvalidRule('unexpected value: "%s", must begin with numerical value' % value) - return value, symbol + return value, description def build_statements(d, scope): - if len(d.keys()) != 1: + if len(d.keys()) > 2: raise InvalidRule('too many statements') key = list(d.keys())[0] @@ -302,48 +301,33 @@ def build_statements(d, scope): term = key[len('count('):-len(')')] - if term.startswith('characteristic('): - # characteristic features are specified a bit specially: - # they simply indicate the presence of something unusual/interesting, - # and we embed the name in the feature name, like `characteristic(nzxor)`. 
- # - # when we're dealing with counts, like `count(characteristic(nzxor))`, - # we can simply extract the feature and assume we're looking for `True` values. - Feature = parse_feature(term) - feature = Feature(True) - ensure_feature_valid_for_scope(scope, feature) - else: - # however, for remaining counted features, like `count(mnemonic(mov))`, - # we have to jump through hoops. - # - # when looking for the existance of such a feature, our rule might look like: - # - mnemonic: mov - # - # but here we deal with the form: `mnemonic(mov)`. - term, _, arg = term.partition('(') - Feature = parse_feature(term) + # when looking for the existence of such a feature, our rule might look like: + # - mnemonic: mov + # + # but here we deal with the form: `mnemonic(mov)`. + term, _, arg = term.partition('(') + Feature = parse_feature(term) - if arg: - arg = arg[:-len(')')] - # can't rely on yaml parsing ints embedded within strings - # like: - # - # count(offset(0xC)) - # count(number(0x11223344)) - # count(number(0x100 = symbol name)) - if term in ('number', 'offset', 'bytes'): - value, symbol = parse_symbol(arg, term) - feature = Feature(value, symbol) - else: - # arg is string, like: - # - # count(mnemonic(mov)) - # count(string(error)) - # TODO: what about embedded newlines? - feature = Feature(arg) + if arg: + arg = arg[:-len(')')] + # can't rely on yaml parsing ints embedded within strings + # like: + # + # count(offset(0xC)) + # count(number(0x11223344)) + # count(number(0x100 = description)) + if term != 'string': + value, description = parse_description(arg, term) + feature = Feature(value, description) else: - feature = Feature() - ensure_feature_valid_for_scope(scope, feature) + # arg is string (which doesn't support inline descriptions), like: + # + # count(string(error)) + # TODO: what about embedded newlines? 
+ feature = Feature(arg) + else: + feature = Feature() + ensure_feature_valid_for_scope(scope, feature) count = d[key] if isinstance(count, int): @@ -370,13 +354,8 @@ def build_statements(d, scope): raise InvalidRule('invalid regular expression: %s it should use Python syntax, try it at https://pythex.org' % d[key]) else: Feature = parse_feature(key) - if key in ('number', 'offset', 'bytes'): - # parse numbers with symbol description, e.g. 0x4550 = IMAGE_DOS_SIGNATURE - # or regular numbers, e.g. 37 - value, symbol = parse_symbol(d[key], key) - feature = Feature(value, symbol) - else: - feature = Feature(d[key]) + value, description = parse_description(d[key], key, d.get('description')) + feature = Feature(value, description) ensure_feature_valid_for_scope(scope, feature) return feature diff --git a/rules b/rules index e5db2268..da61c913 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit e5db22684432c7fb951cd2bf4cde921f90e62f68 +Subproject commit da61c9138efee257bddfb5f68d1905578e11e23a diff --git a/tests/test_freeze.py b/tests/test_freeze.py index 29c05fde..ee51ba5d 100644 --- a/tests/test_freeze.py +++ b/tests/test_freeze.py @@ -12,23 +12,23 @@ from fixtures import * EXTRACTOR = capa.features.extractors.NullFeatureExtractor({ 'file features': [ - (0x402345, capa.features.Characteristic('embedded pe', True)), + (0x402345, capa.features.Characteristic('embedded pe')), ], 'functions': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('switch', True)), + (0x401000, capa.features.Characteristic('switch')), ], 'basic blocks': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('tight loop', True)), + (0x401000, capa.features.Characteristic('tight loop')), ], 'instructions': { 0x401000: { 'features': [ (0x401000, capa.features.insn.Mnemonic('xor')), - (0x401000, capa.features.Characteristic('nzxor', True)), + (0x401000, capa.features.Characteristic('nzxor')), ], }, 0x401002: { @@ -57,9 +57,9 @@ def 
test_null_feature_extractor(): scope: basic block features: - and: - - characteristic(tight loop): true + - characteristic: tight loop - mnemonic: xor - - characteristic(nzxor): true + - characteristic: nzxor ''')), ]) capabilities = capa.main.find_capabilities(rules, EXTRACTOR) @@ -150,7 +150,7 @@ def test_serialize_features(): roundtrip_feature(capa.features.insn.Offset(0x0)) roundtrip_feature(capa.features.insn.Mnemonic('push')) roundtrip_feature(capa.features.file.Section('.rsrc')) - roundtrip_feature(capa.features.Characteristic('tight loop', True)) + roundtrip_feature(capa.features.Characteristic('tight loop')) roundtrip_feature(capa.features.basicblock.BasicBlock()) roundtrip_feature(capa.features.file.Export('BaseThreadInitThunk')) roundtrip_feature(capa.features.file.Import('kernel32.IsWow64Process')) diff --git a/tests/test_main.py b/tests/test_main.py index 5c5640bc..da1834f6 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -42,7 +42,7 @@ def test_ruleset(): name: file rule scope: file features: - - characteristic(embedded pe): y + - characteristic: embedded pe ''')), capa.rules.Rule.from_yaml(textwrap.dedent(''' rule: @@ -50,7 +50,7 @@ def test_ruleset(): name: function rule scope: function features: - - characteristic(switch): y + - characteristic: switch ''')), capa.rules.Rule.from_yaml(textwrap.dedent(''' rule: @@ -58,7 +58,7 @@ def test_ruleset(): name: basic block rule scope: basic block features: - - characteristic(nzxor): y + - characteristic: nzxor ''')), ]) @@ -128,7 +128,7 @@ def test_match_across_scopes(sample_9324d1a8ae37a36ae560c37448c9705a): examples: - 9324d1a8ae37a36ae560c37448c9705a:0x403685 features: - - characteristic(tight loop): true + - characteristic: tight loop ''')), # this rule should match on a function (0x403660) # based on API, as well as prior basic block rule match @@ -176,7 +176,7 @@ def test_subscope_bb_rules(sample_9324d1a8ae37a36ae560c37448c9705a): features: - and: - basic block: - - characteristic(tight 
loop): true + - characteristic: tight loop ''')) ]) # tight loop at 0x403685 diff --git a/tests/test_rules.py b/tests/test_rules.py index 98bd2e79..7de66a0a 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -4,6 +4,7 @@ import pytest import capa.rules from capa.features.insn import Number, Offset +from capa.features import String def test_rule_ctor(): @@ -56,6 +57,22 @@ def test_rule_yaml_complex(): assert r.evaluate({Number(6): {1}, Number(7): {1}, Number(8): {1}}) == False +def test_rule_yaml_descriptions(): + rule = textwrap.dedent(''' + rule: + meta: + name: test rule + features: + - and: + - number: 1 = This is the number 1 + - string: This program cannot be run in DOS mode. + description: MS-DOS stub message + - count(number(2 = AF_INET/SOCK_DGRAM)): 2 + ''') + r = capa.rules.Rule.from_yaml(rule) + assert r.evaluate({Number(1): {1}, Number(2): {2, 3}, String('This program cannot be run in DOS mode.'): {4}}) == True + + def test_rule_yaml_not(): rule = textwrap.dedent(''' rule: @@ -118,7 +135,7 @@ def test_invalid_rule_feature(): name: test rule scope: file features: - - characteristic(nzxor): true + - characteristic: nzxor ''')) with pytest.raises(capa.rules.InvalidRule): @@ -128,7 +145,7 @@ def test_invalid_rule_feature(): name: test rule scope: function features: - - characteristic(embedded pe): true + - characteristic: embedded pe ''')) with pytest.raises(capa.rules.InvalidRule): @@ -138,7 +155,7 @@ def test_invalid_rule_feature(): name: test rule scope: basic block features: - - characteristic(embedded pe): true + - characteristic: embedded pe ''')) @@ -173,11 +190,11 @@ def test_subscope_rules(): scope: file features: - and: - - characteristic(embedded pe): true + - characteristic: embedded pe - function: - and: - - characteristic(nzxor): true - - characteristic(switch): true + - characteristic: nzxor + - characteristic: switch ''')) ]) # the file rule scope will have one rules: @@ -229,7 +246,7 @@ def test_invalid_rules(): meta: name: test 
rule features: - - characteristic(number(1)): True + - characteristic: number(1) ''')) with pytest.raises(capa.rules.InvalidRule): @@ -238,7 +255,7 @@ def test_invalid_rules(): meta: name: test rule features: - - characteristic(count(number(100))): True + - characteristic: count(number(100)) ''')) diff --git a/tests/test_viv_features.py b/tests/test_viv_features.py index ac0bac9d..5f68003a 100644 --- a/tests/test_viv_features.py +++ b/tests/test_viv_features.py @@ -116,7 +116,7 @@ def test_offset_features(mimikatz): def test_nzxor_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x410DFC)) - assert capa.features.Characteristic('nzxor', True) in features # 0x0410F0B + assert capa.features.Characteristic('nzxor') in features # 0x0410F0B def get_bb_insn(f, va): @@ -154,7 +154,7 @@ def test_mnemonic_features(mimikatz): def test_peb_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA6FEC)) - assert capa.features.Characteristic('peb access', True) in features + assert capa.features.Characteristic('peb access') in features def test_file_section_name_features(mimikatz): @@ -170,7 +170,7 @@ def test_tight_loop_features(mimikatz): if bb.va != 0x402F8E: continue features = extract_basic_block_features(f, bb) - assert capa.features.Characteristic('tight loop', True) in features + assert capa.features.Characteristic('tight loop') in features assert capa.features.basicblock.BasicBlock() in features @@ -180,7 +180,7 @@ def test_tight_loop_bb_features(mimikatz): if bb.va != 0x402F8E: continue features = extract_basic_block_features(f, bb) - assert capa.features.Characteristic('tight loop', True) in features + assert capa.features.Characteristic('tight loop') in features assert capa.features.basicblock.BasicBlock() in features @@ -202,17 +202,17 @@ def test_file_import_name_features(mimikatz): def 
test_cross_section_flow_features(sample_a198216798ca38f280dc413f8c57f2c2): features = extract_function_features(viv_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.vw, 0x4014D0)) - assert capa.features.Characteristic('cross section flow', True) in features + assert capa.features.Characteristic('cross section flow') in features # this function has calls to some imports, # which should not trigger cross-section flow characteristic features = extract_function_features(viv_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.vw, 0x401563)) - assert capa.features.Characteristic('cross section flow', True) not in features + assert capa.features.Characteristic('cross section flow') not in features def test_segment_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA6FEC)) - assert capa.features.Characteristic('fs access', True) in features + assert capa.features.Characteristic('fs access') in features def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a): @@ -223,36 +223,36 @@ def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a): def test_file_embedded_pe(pma_lab_12_04): features = extract_file_features(pma_lab_12_04.vw, pma_lab_12_04.path) - assert capa.features.Characteristic('embedded pe', True) in features + assert capa.features.Characteristic('embedded pe') in features def test_stackstring_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x4556E5)) - assert capa.features.Characteristic('stack string', True) in features + assert capa.features.Characteristic('stack string') in features def test_switch_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x409411)) - assert capa.features.Characteristic('switch', True) in features + assert capa.features.Characteristic('switch') in features features = 
extract_function_features(viv_utils.Function(mimikatz.vw, 0x409393)) - assert capa.features.Characteristic('switch', True) not in features + assert capa.features.Characteristic('switch') not in features def test_recursive_call_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10003100)) - assert capa.features.Characteristic('recursive call', True) in features + assert capa.features.Characteristic('recursive call') in features features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10007B00)) - assert capa.features.Characteristic('recursive call', True) not in features + assert capa.features.Characteristic('recursive call') not in features def test_loop_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10003D30)) - assert capa.features.Characteristic('loop', True) in features + assert capa.features.Characteristic('loop') in features features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10007250)) - assert capa.features.Characteristic('loop', True) not in features + assert capa.features.Characteristic('loop') not in features def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5): @@ -263,20 +263,20 @@ def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5): def test_function_calls_to(sample_9324d1a8ae37a36ae560c37448c9705a): features = extract_function_features(viv_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.vw, 0x406F60)) - assert capa.features.Characteristic('calls to', True) in features - assert 
len(features[capa.features.Characteristic('calls to', True)]) == 1 + assert capa.features.Characteristic('calls to') in features + assert len(features[capa.features.Characteristic('calls to')]) == 1 def test_function_calls_to64(sample_lab21_01): features = extract_function_features(viv_utils.Function(sample_lab21_01.vw, 0x1400052D0)) # memcpy - assert capa.features.Characteristic('calls to', True) in features - assert len(features[capa.features.Characteristic('calls to', True)]) == 8 + assert capa.features.Characteristic('calls to') in features + assert len(features[capa.features.Characteristic('calls to')]) == 8 def test_function_calls_from(sample_9324d1a8ae37a36ae560c37448c9705a): features = extract_function_features(viv_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.vw, 0x406F60)) - assert capa.features.Characteristic('calls from', True) in features - assert len(features[capa.features.Characteristic('calls from', True)]) == 23 + assert capa.features.Characteristic('calls from') in features + assert len(features[capa.features.Characteristic('calls from')]) == 23 def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a): @@ -286,8 +286,8 @@ def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a): def test_indirect_call_features(sample_a933a1a402775cfa94b6bee0963f4b46): features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA68A0)) - assert capa.features.Characteristic('indirect call', True) in features - assert len(features[capa.features.Characteristic('indirect call', True)]) == 3 + assert capa.features.Characteristic('indirect call') in features + assert len(features[capa.features.Characteristic('indirect call')]) == 3 def test_indirect_calls_resolved(sample_c91887d861d9bd4a5872249b641bc9f9):