From 9aba2eb3a563a39a7356ff0739e79d02a9842857 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Tue, 30 Jun 2020 00:44:22 -0600 Subject: [PATCH 01/18] rules: range: correct handling of range with min==0 closes #57 --- capa/engine.py | 8 +++- tests/test_engine.py | 100 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 3 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 4be1e32c..0eb164f1 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -154,10 +154,14 @@ class Range(Statement): self.max = max if max is not None else (1 << 64 - 1) def evaluate(self, ctx): - if self.child not in ctx: + count = len(ctx.get(self.child, [])) + if self.min == 0: + if count == 0: + return Result(True, self, []) + elif self.child not in ctx: + # self.min > 0 so there needs to be more than zero matches return Result(False, self, []) - count = len(ctx[self.child]) return Result(self.min <= count <= self.max, self, [], locations=ctx[self.child]) def __str__(self): diff --git a/tests/test_engine.py b/tests/test_engine.py index 5c7c9a3c..5681ba64 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -65,7 +65,8 @@ def test_complex(): def test_range(): # unbounded range, but no matching feature - assert Range(Number(1)).evaluate({Number(2): {}}) == False + # since the lower bound is zero, and there are zero matches, ok + assert Range(Number(1)).evaluate({Number(2): {}}) == True # unbounded range with matching feature should always match assert Range(Number(1)).evaluate({Number(1): {}}) == True @@ -96,6 +97,103 @@ def test_range(): assert Range(Number(1), min=1, max=3).evaluate({Number(1): {1, 2, 3, 4}}) == False +def test_range_exact(): + rule = textwrap.dedent(''' + rule: + meta: + name: test rule + features: + - count(number(100)): 2 + ''') + r = capa.rules.Rule.from_yaml(rule) + + # just enough matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1, 2}}, 0x0) + assert 'test rule' in matches + + # 
not enough matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1}}, 0x0) + assert 'test rule' not in matches + + # too many matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1, 2, 3}}, 0x0) + assert 'test rule' not in matches + + +def test_range_range(): + rule = textwrap.dedent(''' + rule: + meta: + name: test rule + features: + - count(number(100)): (2, 3) + ''') + r = capa.rules.Rule.from_yaml(rule) + + # just enough matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1, 2}}, 0x0) + assert 'test rule' in matches + + # enough matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1, 2, 3}}, 0x0) + assert 'test rule' in matches + + # not enough matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1}}, 0x0) + assert 'test rule' not in matches + + # too many matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1, 2, 3, 4}}, 0x0) + assert 'test rule' not in matches + + +def test_range_exact_zero(): + rule = textwrap.dedent(''' + rule: + meta: + name: test rule + features: + - count(number(100)): 0 + ''') + r = capa.rules.Rule.from_yaml(rule) + + # feature isn't indexed - good. + features, matches = capa.engine.match([r], {}, 0x0) + assert 'test rule' in matches + + # feature is indexed, but no matches. + # i don't think we should ever really have this case, but good to check anyways. 
+ features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {}}, 0x0) + assert 'test rule' in matches + + # too many matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1}}, 0x0) + assert 'test rule' not in matches + + +def test_range_with_zero(): + rule = textwrap.dedent(''' + rule: + meta: + name: test rule + features: + - count(number(100)): (0, 1) + ''') + r = capa.rules.Rule.from_yaml(rule) + + # ok + features, matches = capa.engine.match([r], {}, 0x0) + assert 'test rule' in matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {}}, 0x0) + assert 'test rule' in matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1}}, 0x0) + assert 'test rule' in matches + + # too many matches + features, matches = capa.engine.match([r], {capa.features.insn.Number(100): {1, 2}}, 0x0) + assert 'test rule' not in matches + + def test_match_adds_matched_rule_feature(): '''show that using `match` adds a feature for matched rules.''' rule = textwrap.dedent(''' From ebff65adc4e90a8affd8f95125d1e955906fbd99 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Tue, 30 Jun 2020 00:46:19 -0600 Subject: [PATCH 02/18] rules: range: simplify logic --- capa/engine.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 0eb164f1..15a05796 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -155,12 +155,8 @@ class Range(Statement): def evaluate(self, ctx): count = len(ctx.get(self.child, [])) - if self.min == 0: - if count == 0: - return Result(True, self, []) - elif self.child not in ctx: - # self.min > 0 so there needs to be more than zero matches - return Result(False, self, []) + if self.min == 0 and count == 0: + return Result(True, self, []) return Result(self.min <= count <= self.max, self, [], locations=ctx[self.child]) From da9f8fede4cd6b7009799bf3fb459f9b7fff2f90 Mon Sep 17 00:00:00 2001 From: William 
Ballenthin Date: Wed, 1 Jul 2020 12:05:43 -0600 Subject: [PATCH 03/18] render: json: include locations for range closes #43 --- capa/render/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/capa/render/__init__.py b/capa/render/__init__.py index cafa7aac..e53aca1f 100644 --- a/capa/render/__init__.py +++ b/capa/render/__init__.py @@ -1,6 +1,7 @@ import json import six +import capa.rules import capa.engine @@ -150,6 +151,9 @@ def convert_match_to_result_document(rules, capabilities, result): if isinstance(result.statement, capa.features.Feature): if bool(result.success): doc['locations'] = result.locations + elif isinstance(result.statement, capa.rules.Range): + if bool(result.success): + doc['locations'] = result.locations # if we have a `match` statement, then we're referencing another rule. # this could an external rule (written by a human), or From 6229377408795affef63ea41f6d0381142e55e58 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Wed, 1 Jul 2020 12:10:52 -0600 Subject: [PATCH 04/18] render: vverbose: factor out rendering of locations --- capa/render/vverbose.py | 51 +++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index e59a9596..46dd09f8 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -4,7 +4,26 @@ import capa.rules import capa.render.utils as rutils -def render_statement(ostream, statement, indent=0): +def render_locations(ostream, match): + # its possible to have an empty locations array here, + # such as when we're in MODE_FAILURE and showing the logic + # under a `not` statement (which will have no matched locations). + locations = list(sorted(match.get('locations', []))) + if len(locations) == 1: + ostream.write(' @ ') + ostream.write(rutils.hex(locations[0])) + elif len(locations) > 1: + ostream.write(' @ ') + if len(locations) > 4: + # don't display too many locations, because it becomes very noisy. 
+ # probably only the first handful of locations will be useful for inspection. + ostream.write(', '.join(map(rutils.hex, locations[0:4]))) + ostream.write(', and %d more...' % (len(locations) - 4)) + else: + ostream.write(', '.join(map(rutils.hex, locations))) + + +def render_statement(ostream, match, statement, indent=0): ostream.write(' ' * indent) if statement['type'] in ('and', 'or', 'optional'): ostream.write(statement['type']) @@ -36,13 +55,16 @@ def render_statement(ostream, statement, indent=0): ostream.write('count(%s): ' % feature) if statement['max'] == statement['min']: - ostream.writeln('%d' % (statement['min'])) + ostream.write('%d' % (statement['min'])) elif statement['min'] == 0: - ostream.writeln('%d or fewer' % (statement['max'])) + ostream.write('%d or fewer' % (statement['max'])) elif statement['max'] == (1 << 64 - 1): - ostream.writeln('%d or more' % (statement['min'])) + ostream.write('%d or more' % (statement['min'])) else: - ostream.writeln('between %d and %d' % (statement['min'], statement['max'])) + ostream.write('between %d and %d' % (statement['min'], statement['max'])) + + render_locations(ostream, match) + ostream.write('\n') elif statement['type'] == 'subscope': ostream.write(statement['subscope']) ostream.writeln(':') @@ -77,29 +99,14 @@ def render_feature(ostream, match, feature, indent=0): else: raise RuntimeError('unexpected feature type: ' + str(feature)) - # its possible to have an empty locations array here, - # such as when we're in MODE_FAILURE and showing the logic - # under a `not` statement (which will have no matched locations). - locations = list(sorted(match.get('locations', []))) - if len(locations) == 1: - ostream.write(' @ ') - ostream.write(rutils.hex(locations[0])) - elif len(locations) > 1: - ostream.write(' @ ') - if len(locations) > 4: - # don't display too many locations, because it becomes very noisy. - # probably only the first handful of locations will be useful for inspection. 
- ostream.write(', '.join(map(rutils.hex, locations[0:4]))) - ostream.write(', and %d more...' % (len(locations) - 4)) - else: - ostream.write(', '.join(map(rutils.hex, locations))) + render_locations(ostream, match) ostream.write('\n') def render_node(ostream, match, node, indent=0): if node['type'] == 'statement': - render_statement(ostream, node['statement'], indent=indent) + render_statement(ostream, match, node['statement'], indent=indent) elif node['type'] == 'feature': render_feature(ostream, match, node['feature'], indent=indent) else: From 65f75e517a1a21ce5bbbbc6bf93d0d767287eddc Mon Sep 17 00:00:00 2001 From: Michael Hunhoff Date: Wed, 1 Jul 2020 13:26:00 -0600 Subject: [PATCH 05/18] adding support for multiple locations under range --- capa/ida/explorer/model.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py index 7b3b21c7..64dfb63a 100644 --- a/capa/ida/explorer/model.py +++ b/capa/ida/explorer/model.py @@ -298,11 +298,12 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): return item.childCount() - def render_capa_doc_statement_node(self, parent, statement, doc): + def render_capa_doc_statement_node(self, parent, statement, locations, doc): """ render capa statement read from doc @param parent: parent to which new child is assigned @param statement: statement read from doc + @param locations: locations of children (applies to range only?) 
@param doc: capa result doc "statement": { @@ -332,7 +333,13 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): else: display += 'between %d and %d' % (statement['min'], statement['max']) - return CapaExplorerFeatureItem(parent, display=display) + parent2 = CapaExplorerFeatureItem(parent, display=display) + + for location in locations: + # for each location render child node for range statement + self.render_capa_doc_feature(parent2, statement['child'], location, doc) + + return parent2 elif statement['type'] == 'subscope': return CapaExplorerFeatureItem(parent, 'subscope(%s)' % statement['subscope']) elif statement['type'] == 'regex': @@ -377,7 +384,8 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): return if match['node']['type'] == 'statement': - parent2 = self.render_capa_doc_statement_node(parent, match['node']['statement'], doc) + parent2 = self.render_capa_doc_statement_node(parent, match['node']['statement'], + match.get('locations', []), doc) elif match['node']['type'] == 'feature': parent2 = self.render_capa_doc_feature_node(parent, match['node']['feature'], match['locations'], doc) else: From 1cf36b5792dea74fd70d302513b62cee9d4013e6 Mon Sep 17 00:00:00 2001 From: Michael Hunhoff Date: Wed, 1 Jul 2020 13:38:20 -0600 Subject: [PATCH 06/18] adding new item type for subscope to help render --- capa/ida/explorer/item.py | 8 ++++++++ capa/ida/explorer/model.py | 7 ++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/capa/ida/explorer/item.py b/capa/ida/explorer/item.py index 729faa4a..72a955b7 100644 --- a/capa/ida/explorer/item.py +++ b/capa/ida/explorer/item.py @@ -184,6 +184,14 @@ class CapaExplorerFunctionItem(CapaExplorerDataItem): self._data[0] = self.fmt % display +class CapaExplorerSubscopeItem(CapaExplorerDataItem): + + fmt = 'subscope(%s)' + + def __init__(self, parent, scope): + super(CapaExplorerSubscopeItem, self).__init__(parent, [self.fmt % scope, '', '']) + + class CapaExplorerBlockItem(CapaExplorerDataItem): 
""" store data relevant to capa basic block result """ diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py index 7b3b21c7..98d292b9 100644 --- a/capa/ida/explorer/model.py +++ b/capa/ida/explorer/model.py @@ -16,7 +16,8 @@ from capa.ida.explorer.item import ( CapaExplorerByteViewItem, CapaExplorerBlockItem, CapaExplorerRuleMatchItem, - CapaExplorerFeatureItem + CapaExplorerFeatureItem, + CapaExplorerSubscopeItem ) import capa.ida.helpers @@ -105,7 +106,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): if role == QtCore.Qt.FontRole and isinstance(item, (CapaExplorerRuleItem, CapaExplorerRuleMatchItem, CapaExplorerBlockItem, CapaExplorerFunctionItem, - CapaExplorerFeatureItem)) and \ + CapaExplorerFeatureItem, CapaExplorerSubscopeItem)) and \ column == CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION: # set bold font for top-level rules font = QtGui.QFont() @@ -334,7 +335,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): return CapaExplorerFeatureItem(parent, display=display) elif statement['type'] == 'subscope': - return CapaExplorerFeatureItem(parent, 'subscope(%s)' % statement['subscope']) + return CapaExplorerSubscopeItem(parent, statement['subscope']) elif statement['type'] == 'regex': # regex is a `Statement` not a `Feature` # this is because it doesn't get extracted, but applies to all strings in scope. 
From 28ac48dd174c54ae38ad65ee4da565230d37d1e0 Mon Sep 17 00:00:00 2001 From: Michael Hunhoff Date: Wed, 1 Jul 2020 13:54:00 -0600 Subject: [PATCH 07/18] add default sort order asc when reset occurs --- capa/ida/ida_capa_explorer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/capa/ida/ida_capa_explorer.py b/capa/ida/ida_capa_explorer.py index 873e0513..59bc2bb6 100644 --- a/capa/ida/ida_capa_explorer.py +++ b/capa/ida/ida_capa_explorer.py @@ -378,10 +378,14 @@ class CapaExplorerForm(idaapi.PluginForm): self.render_capa_doc_summary(doc) self.render_capa_doc_mitre_summary(doc) - self.view_tree.sortByColumn(CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION, QtCore.Qt.AscendingOrder) + self.set_view_tree_default_sort_order() logger.info('render views completed.') + def set_view_tree_default_sort_order(self): + """ """ + self.view_tree.sortByColumn(CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION, QtCore.Qt.AscendingOrder) + def render_capa_doc_summary(self, doc): """ """ for (row, rule) in enumerate(rutils.capability_rules(doc)): @@ -462,6 +466,7 @@ class CapaExplorerForm(idaapi.PluginForm): self.model_data.reset() self.view_tree.reset() self.view_checkbox_limit_by.setChecked(False) + self.set_view_tree_default_sort_order() def reload(self): """ reload views and re-run capa analysis """ From 36afed2d2281ef485fb3be5587b12e17727f4468 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Wed, 1 Jul 2020 17:35:18 -0600 Subject: [PATCH 08/18] pyinstaller working on linux/py2 closes #40 --- ci/hooks/hook-vivisect.py | 13 +++ ci/pyinstaller.spec | 182 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 ci/hooks/hook-vivisect.py create mode 100644 ci/pyinstaller.spec diff --git a/ci/hooks/hook-vivisect.py b/ci/hooks/hook-vivisect.py new file mode 100644 index 00000000..3714cfb0 --- /dev/null +++ b/ci/hooks/hook-vivisect.py @@ -0,0 +1,13 @@ +from PyInstaller.utils.hooks import copy_metadata + +# in order 
for viv-utils to use pkg_resources to fetch +# the installed version of vivisect, +# we need to instruct pyinstaller to embed this metadata. +# +# so we set the pyinstaller.spec/hookspath to reference +# the directory with this hook. +# +# this hook runs at analysis time and updates the embedded metadata. +# +# ref: https://github.com/pyinstaller/pyinstaller/issues/1713#issuecomment-162682084 +datas = copy_metadata('vivisect') diff --git a/ci/pyinstaller.spec b/ci/pyinstaller.spec new file mode 100644 index 00000000..a5e29f0a --- /dev/null +++ b/ci/pyinstaller.spec @@ -0,0 +1,182 @@ +# -*- mode: python -*- + +block_cipher = None + +a = Analysis( + ['../capa/main.py'], + pathex=['capa'], + binaries=None, + datas=[ + ('../rules', 'rules'), + ], + hiddenimports=[ + # vivisect does manual/runtime importing of its modules, + # so declare the things that could be imported here. + "pycparser", + "vivisect", + "vivisect.analysis", + "vivisect.analysis.amd64", + "vivisect.analysis.amd64", + "vivisect.analysis.amd64.emulation", + "vivisect.analysis.amd64.golang", + "vivisect.analysis.crypto", + "vivisect.analysis.crypto", + "vivisect.analysis.crypto.constants", + "vivisect.analysis.elf", + "vivisect.analysis.elf", + "vivisect.analysis.elf.elfplt", + "vivisect.analysis.elf.libc_start_main", + "vivisect.analysis.generic", + "vivisect.analysis.generic", + "vivisect.analysis.generic.codeblocks", + "vivisect.analysis.generic.emucode", + "vivisect.analysis.generic.entrypoints", + "vivisect.analysis.generic.funcentries", + "vivisect.analysis.generic.impapi", + "vivisect.analysis.generic.mkpointers", + "vivisect.analysis.generic.pointers", + "vivisect.analysis.generic.pointertables", + "vivisect.analysis.generic.relocations", + "vivisect.analysis.generic.strconst", + "vivisect.analysis.generic.switchcase", + "vivisect.analysis.generic.thunks", + "vivisect.analysis.i386", + "vivisect.analysis.i386", + "vivisect.analysis.i386.calling", + "vivisect.analysis.i386.golang", + 
"vivisect.analysis.i386.importcalls", + "vivisect.analysis.i386.instrhook", + "vivisect.analysis.i386.thunk_bx", + "vivisect.analysis.ms", + "vivisect.analysis.ms", + "vivisect.analysis.ms.hotpatch", + "vivisect.analysis.ms.localhints", + "vivisect.analysis.ms.msvc", + "vivisect.analysis.ms.msvcfunc", + "vivisect.analysis.ms.vftables", + "vivisect.analysis.pe", + "vivisect.impapi.posix.amd64", + "vivisect.impapi.posix.i386", + "vivisect.impapi.windows", + "vivisect.impapi.windows.amd64", + "vivisect.impapi.windows.i386", + "vivisect.parsers.blob", + "vivisect.parsers.elf", + "vivisect.parsers.ihex", + "vivisect.parsers.macho", + "vivisect.parsers.parse_pe", + "vivisect.parsers.utils", + "vivisect.storage", + "vivisect.storage.basicfile", + "vstruct.constants", + "vstruct.constants.ntstatus", + "vstruct.defs", + "vstruct.defs.arm7", + "vstruct.defs.bmp", + "vstruct.defs.dns", + "vstruct.defs.elf", + "vstruct.defs.gif", + "vstruct.defs.ihex", + "vstruct.defs.inet", + "vstruct.defs.java", + "vstruct.defs.kdcom", + "vstruct.defs.macho", + "vstruct.defs.macho.const", + "vstruct.defs.macho.fat", + "vstruct.defs.macho.loader", + "vstruct.defs.macho.stabs", + "vstruct.defs.minidump", + "vstruct.defs.pcap", + "vstruct.defs.pe", + "vstruct.defs.pptp", + "vstruct.defs.rar", + "vstruct.defs.swf", + "vstruct.defs.win32", + "vstruct.defs.windows", + "vstruct.defs.windows.win_5_1_i386", + "vstruct.defs.windows.win_5_1_i386.ntdll", + "vstruct.defs.windows.win_5_1_i386.ntoskrnl", + "vstruct.defs.windows.win_5_1_i386.win32k", + "vstruct.defs.windows.win_5_2_i386", + "vstruct.defs.windows.win_5_2_i386.ntdll", + "vstruct.defs.windows.win_5_2_i386.ntoskrnl", + "vstruct.defs.windows.win_5_2_i386.win32k", + "vstruct.defs.windows.win_6_1_amd64", + "vstruct.defs.windows.win_6_1_amd64.ntdll", + "vstruct.defs.windows.win_6_1_amd64.ntoskrnl", + "vstruct.defs.windows.win_6_1_amd64.win32k", + "vstruct.defs.windows.win_6_1_i386", + "vstruct.defs.windows.win_6_1_i386.ntdll", + 
"vstruct.defs.windows.win_6_1_i386.ntoskrnl", + "vstruct.defs.windows.win_6_1_i386.win32k", + "vstruct.defs.windows.win_6_1_wow64", + "vstruct.defs.windows.win_6_1_wow64.ntdll", + "vstruct.defs.windows.win_6_2_amd64", + "vstruct.defs.windows.win_6_2_amd64.ntdll", + "vstruct.defs.windows.win_6_2_amd64.ntoskrnl", + "vstruct.defs.windows.win_6_2_amd64.win32k", + "vstruct.defs.windows.win_6_2_i386", + "vstruct.defs.windows.win_6_2_i386.ntdll", + "vstruct.defs.windows.win_6_2_i386.ntoskrnl", + "vstruct.defs.windows.win_6_2_i386.win32k", + "vstruct.defs.windows.win_6_2_wow64", + "vstruct.defs.windows.win_6_2_wow64.ntdll", + "vstruct.defs.windows.win_6_3_amd64", + "vstruct.defs.windows.win_6_3_amd64.ntdll", + "vstruct.defs.windows.win_6_3_amd64.ntoskrnl", + "vstruct.defs.windows.win_6_3_i386", + "vstruct.defs.windows.win_6_3_i386.ntdll", + "vstruct.defs.windows.win_6_3_i386.ntoskrnl", + "vstruct.defs.windows.win_6_3_wow64", + "vstruct.defs.windows.win_6_3_wow64.ntdll", + ], + hookspath=['ci/hooks'], + runtime_hooks=None, + excludes=[ + # ignore packages that would otherwise be bundled with the .exe. + # review: build/pyinstaller/xref-pyinstaller.html + + # we don't do any GUI stuff, so ignore these modules + "tkinter", + "_tkinter", + "Tkinter", + # tqdm provides renderers for ipython, + # however, this drags in a lot of dependencies. + # since we don't spawn a notebook, we can safely remove these. 
+ "IPython", + "ipywidgets", + ], + win_no_prefer_redirects=None, + win_private_assemblies=None, + cipher=block_cipher) + +a.binaries = a.binaries - TOC([ + ('tcl85.dll', None, None), + ('tk85.dll', None, None), + ('_tkinter', None, None)]) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE(pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + exclude_binaries=False, + name='capa', + icon='logo.ico', + debug=False, + strip=None, + upx=True, + console=True ) + +# enable the following to debug the contents of the .exe +# +#coll = COLLECT(exe, +# a.binaries, +# a.zipfiles, +# a.datas, +# strip=None, +# upx=True, +# name='capa-dat') + From a80f38ec753d4b7c28d5dd0a32c3f6a98556373d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Thu, 2 Jul 2020 09:24:00 +0200 Subject: [PATCH 09/18] Fix get_features rule linter To support matching namespaces `get_features()` get an extra parameters which was missing in the rule linter. --- scripts/lint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/lint.py b/scripts/lint.py index cbbfcb71..a66ac2ea 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -318,7 +318,8 @@ def lint_features(ctx, rule): def get_features(ctx, rule): # get features from rule and all dependencies including subscopes and matched rules features = [] - deps = [ctx['rules'].rules[dep] for dep in rule.get_dependencies()] + namespaces = capa.rules.index_rules_by_namespace([rule]) + deps = [ctx['rules'].rules[dep] for dep in rule.get_dependencies(namespaces)] for r in [rule] + deps: features.extend(get_rule_features(r)) return features From d0e962f9d073a55a25a4a7eda33422d9e3258576 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 2 Jul 2020 01:27:07 -0600 Subject: [PATCH 10/18] pyinstaller: add wcwidth data files --- ci/pyinstaller.spec | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ci/pyinstaller.spec 
b/ci/pyinstaller.spec index a5e29f0a..c62505ae 100644 --- a/ci/pyinstaller.spec +++ b/ci/pyinstaller.spec @@ -1,6 +1,6 @@ # -*- mode: python -*- - -block_cipher = None +import os.path +import wcwidth a = Analysis( ['../capa/main.py'], @@ -8,6 +8,13 @@ a = Analysis( binaries=None, datas=[ ('../rules', 'rules'), + # capa.render.default uses tabulate that depends on wcwidth. + # it seems wcwidth uses a json file `version.json` + # and this doesn't get picked up by pyinstaller automatically. + # so we manually embed the wcwidth resources here. + # + # ref: https://stackoverflow.com/a/62278462/87207 + (os.path.dirname(wcwidth.__file__), 'wcwidth') ], hiddenimports=[ # vivisect does manual/runtime importing of its modules, @@ -145,17 +152,14 @@ a = Analysis( # since we don't spawn a notebook, we can safely remove these. "IPython", "ipywidgets", - ], - win_no_prefer_redirects=None, - win_private_assemblies=None, - cipher=block_cipher) + ]) a.binaries = a.binaries - TOC([ ('tcl85.dll', None, None), ('tk85.dll', None, None), ('_tkinter', None, None)]) -pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) +pyz = PYZ(a.pure, a.zipped_data) exe = EXE(pyz, a.scripts, From d2ab09db5df906e60d5df4edb73ddd10480e3a9d Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 2 Jul 2020 01:38:06 -0600 Subject: [PATCH 11/18] pyinstaller: embed capa version in exe --- ci/pyinstaller.spec | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ci/pyinstaller.spec b/ci/pyinstaller.spec index c62505ae..785db528 100644 --- a/ci/pyinstaller.spec +++ b/ci/pyinstaller.spec @@ -1,7 +1,14 @@ # -*- mode: python -*- import os.path +import subprocess + import wcwidth + +with open('./capa/version.py', 'wb') as f: + f.write("__version__ = '%s'" + % subprocess.check_output(["git", "describe", "--always"]).strip()) + a = Analysis( ['../capa/main.py'], pathex=['capa'], From f8cfb67d028814fe6263febc1ffc24d99e6be68c Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Thu, 2 Jul 2020 11:10:28 +0200 Subject: [PATCH 12/18] Fix KeyError in Range#evaluate() If the key doesn't exist, `evaluate` raises a `KeyError` Exception, making the tests fail. --- capa/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/engine.py b/capa/engine.py index 15a05796..bdeff9ff 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -158,7 +158,7 @@ class Range(Statement): if self.min == 0 and count == 0: return Result(True, self, []) - return Result(self.min <= count <= self.max, self, [], locations=ctx[self.child]) + return Result(self.min <= count <= self.max, self, [], locations=ctx.get(self.child)) def __str__(self): if self.max == (1 << 64 - 1): From 767a76d8147fdb4b1c6849415c5515cb663d9bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Wed, 17 Jun 2020 08:43:04 +0200 Subject: [PATCH 13/18] Allow to add a description for every feature Enable associate context for all features. This was called symbol before and only enabled for `number`, `offset` and `bytes`. This is not enabled for strings with regular expressions, as they are not a feature. --- README.md | 49 ++++++++++++++++++++++++++++++------- capa/features/__init__.py | 51 +++++++++++++++++++++------------------ capa/features/file.py | 21 +++++----------- capa/features/insn.py | 33 +++++++++---------------- capa/rules.py | 37 ++++++++++++---------------- 5 files changed, 100 insertions(+), 91 deletions(-) diff --git a/README.md b/README.md index b8647373..318b6ba6 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,25 @@ These are the features supported at the function-scope: - [mnemonic](#mnemonic) - [characteristics](#characteristics) +All of them support an optional description which helps with documenting rules and provides context in capa's output. +It can be specified in the following way: + +``` +- string: This program cannot be run in DOS mode. 
+ description: MS-DOS stub message +- number: 0x4550 + description: IMAGE_DOS_SIGNATURE (MZ) +``` + +For all features except for [string](#string), the description can be specified inline preceded by ` = `. +For the previous [number](#number) example: + +``` +- number: 0x4550 = IMAGE_DOS_SIGNATURE (MZ) +``` + +The inline syntax is preferred (except for [string](#string)). + ### api A call to a named function, probably an import, though possibly a local function (like `malloc`) extracted via FLIRT. @@ -339,8 +358,8 @@ For example, a crypto constant. The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format. -To associate context with a number, e.g. for constant definitions, append an equal sign and the respective name to -the number definition. This helps with documenting rules and provides context in capa's output. +It can include an optional description, e.g. for constant definitions. +The inline syntax is preferred (` = DESCRIPTION STRING`). Examples: @@ -362,20 +381,29 @@ Regexes should be surrounded with `/` characters. By default, capa uses case-sensitive matching and assumes leading and trailing wildcards. To perform case-insensitive matching append an `i`. To anchor the regex at the start or end of a string, use `^` and/or `$`. +Strings can include a description, but the inline syntax is not supported. + Examples: - string: This program cannot be run in DOS mode. - string: Firefox 64.0 - string: /SELECT.*FROM.*WHERE/ - string: /Hardware\\Description\\System\\CentralProcessor/i - +``` +- string: This program cannot be run in DOS mode. + description: MS-DOS stub message +- string: '{3E5FC7F9-9A51-4367-9063-A120244FBEC7}' + description: CLSID_CMSTPLUA +- string: Firefox 64.0 +- string: /SELECT.*FROM.*WHERE/ +- string: /Hardware\\Description\\System\\CentralProcessor/i +``` + Note that regex matching is expensive (`O(features)` rather than `O(1)`) so they should be used sparingly. 
### bytes A sequence of bytes referenced by the logic of the program. The provided sequence must match from the beginning of the referenced bytes and be no more than `0x100` bytes. -The parameter is a sequence of hexadecimal bytes followed by an optional description. - +The parameter is a sequence of hexadecimal bytes. +It can include an optional description. +The inline syntax is preferred (` = DESCRIPTION STRING`). + The example below illustrates byte matching given a COM CLSID pushed onto the stack prior to `CoCreateInstance`. @@ -397,6 +425,7 @@ A structure offset referenced by the logic of the program. This should not be a stack offset. The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format. +It can be followed by an optional description. Examples: @@ -453,6 +482,8 @@ These are the features supported at the file-scope: - [import](#import) - [section](#section) +All of them can be followed by an optional description, as the features in the previous section. + ### file string An ASCII or UTF-16 LE string present in the file. 
diff --git a/capa/features/__init__.py b/capa/features/__init__.py index 9fec2d76..182cd514 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -17,10 +17,11 @@ def bytes_to_str(b): class Feature(object): - def __init__(self, args): + def __init__(self, args, description=None): super(Feature, self).__init__() self.name = self.__class__.__name__ self.args = args + self.description = description def __hash__(self): return hash((self.name, tuple(self.args))) @@ -28,8 +29,17 @@ class Feature(object): def __eq__(self, other): return self.name == other.name and self.args == other.args + def _str_name(self): + return self.name.lower() + + def _str_value(self): + return ','.join(self.args) + def __str__(self): - return '%s(%s)' % (self.name.lower(), ','.join(self.args)) + if self.description: + return '%s(%s = %s)' % (self._str_name(), self._str_value(), self.description) + else: + return '%s(%s)' % (self._str_name(), self._str_value()) def __repr__(self): return str(self) @@ -50,22 +60,22 @@ class Feature(object): class MatchedRule(Feature): - def __init__(self, rule_name): - super(MatchedRule, self).__init__([rule_name]) + def __init__(self, rule_name, description=None): + super(MatchedRule, self).__init__([rule_name], description) self.rule_name = rule_name - def __str__(self): - return 'match(%s)' % (self.rule_name) + def _str_name(self): + return 'match' class Characteristic(Feature): - def __init__(self, name, value=None): + def __init__(self, name, value=None, description=None): ''' when `value` is not provided, this serves as descriptor for a class of characteristics. this is only used internally, such as in `rules.py` when checking if a statement is supported by a given scope. 
''' - super(Characteristic, self).__init__([name, value]) + super(Characteristic, self).__init__([name, value], description) self.name = name self.value = value @@ -74,27 +84,23 @@ class Characteristic(Feature): raise ValueError('cannot evaluate characteristc %s with empty value' % (str(self))) return super(Characteristic, self).evaluate(ctx) - def __str__(self): + def _str_value(self): if self.value is None: - return 'characteristic(%s)' % (self.name) + return self.name else: - return 'characteristic(%s(%s))' % (self.name, self.value) + return '%s(%s)' % (self.name, self.value) class String(Feature): - def __init__(self, value): - super(String, self).__init__([value]) + def __init__(self, value, description=None): + super(String, self).__init__([value], description) self.value = value - def __str__(self): - return 'string("%s")' % (self.value) - class Bytes(Feature): - def __init__(self, value, symbol=None): - super(Bytes, self).__init__([value]) + def __init__(self, value, description=None): + super(Bytes, self).__init__([value], description) self.value = value - self.symbol = symbol def evaluate(self, ctx): for feature, locations in ctx.items(): @@ -106,11 +112,8 @@ class Bytes(Feature): return capa.engine.Result(False, self, []) - def __str__(self): - if self.symbol: - return 'bytes(0x%s = %s)' % (bytes_to_str(self.value).upper(), self.symbol) - else: - return 'bytes(0x%s)' % (bytes_to_str(self.value).upper()) + def _str_value(self): + return '0x%s' % bytes_to_str(self.value).upper() def freeze_serialize(self): return (self.__class__.__name__, diff --git a/capa/features/file.py b/capa/features/file.py index 708b8e2b..396edd1f 100644 --- a/capa/features/file.py +++ b/capa/features/file.py @@ -2,30 +2,21 @@ from capa.features import Feature class Export(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is export name - super(Export, self).__init__([value]) + super(Export, self).__init__([value], description) self.value 
= value - def __str__(self): - return 'Export(%s)' % (self.value) - class Import(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is import name - super(Import, self).__init__([value]) + super(Import, self).__init__([value], description) self.value = value - def __str__(self): - return 'Import(%s)' % (self.value) - class Section(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is section name - super(Section, self).__init__([value]) + super(Section, self).__init__([value], description) self.value = value - - def __str__(self): - return 'Section(%s)' % (self.value) diff --git a/capa/features/insn.py b/capa/features/insn.py index b8ebf9da..122bd0ae 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -2,45 +2,34 @@ from capa.features import Feature class API(Feature): - def __init__(self, name): + def __init__(self, name, description=None): # Downcase library name if given if '.' in name: modname, impname = name.split('.') name = modname.lower() + '.' 
+ impname - super(API, self).__init__([name]) + super(API, self).__init__([name], description) class Number(Feature): - def __init__(self, value, symbol=None): - super(Number, self).__init__([value]) + def __init__(self, value, description=None): + super(Number, self).__init__([value], description) self.value = value - self.symbol = symbol - def __str__(self): - if self.symbol: - return 'number(0x%x = %s)' % (self.value, self.symbol) - else: - return 'number(0x%x)' % (self.value) + def _str_value(self): + return '0x%x' % self.value class Offset(Feature): - def __init__(self, value, symbol=None): + def __init__(self, value, description=None): super(Offset, self).__init__([value]) self.value = value - self.symbol = symbol - def __str__(self): - if self.symbol: - return 'offset(0x%x = %s)' % (self.value, self.symbol) - else: - return 'offset(0x%x)' % (self.value) + def _str_value(self): + return '0x%x' % self.value class Mnemonic(Feature): - def __init__(self, value): - super(Mnemonic, self).__init__([value]) + def __init__(self, value, description=None): + super(Mnemonic, self).__init__([value], description) self.value = value - - def __str__(self): - return 'mnemonic(%s)' % (self.value) diff --git a/capa/rules.py b/capa/rules.py index 8e42d0fc..7e2ffd6a 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -207,7 +207,7 @@ def parse_feature(key): return capa.features.basicblock.BasicBlock elif key.startswith('characteristic(') and key.endswith(')'): characteristic = key[len('characteristic('):-len(')')] - return lambda v: capa.features.Characteristic(characteristic, v) + return lambda v, description=None: capa.features.Characteristic(characteristic, v, description) elif key == 'export': return capa.features.file.Export elif key == 'import': @@ -220,18 +220,18 @@ def parse_feature(key): raise InvalidRule('unexpected statement: %s' % key) -def parse_symbol(s, value_type): +def parse_description(s, value_type, description=None): ''' s can be an int or a string ''' - if 
isinstance(s, str) and '=' in s: - value, symbol = s.split('=', 1) - symbol = symbol.strip() - if symbol == '': - raise InvalidRule('unexpected value: "%s", symbol name cannot be empty' % s) + if value_type != 'string' and isinstance(s, str) and ' = ' in s: + if description: + raise InvalidRule('unexpected value: "%s", only one description allowed (inline description with `=`)' % s) + value, description = s.split(' = ', 1) + if description == '': + raise InvalidRule('unexpected value: "%s", description cannot be empty' % s) else: value = s - symbol = None if isinstance(value, str): if value_type == 'bytes': @@ -244,17 +244,17 @@ def parse_symbol(s, value_type): if len(value) > MAX_BYTES_FEATURE_SIZE: raise InvalidRule('unexpected bytes value: byte sequences must be no larger than %s bytes' % MAX_BYTES_FEATURE_SIZE) - else: + elif value_type in ['number', 'offset']: try: value = parse_int(value) except ValueError: raise InvalidRule('unexpected value: "%s", must begin with numerical value' % value) - return value, symbol + return value, description def build_statements(d, scope): - if len(d.keys()) != 1: + if len(d.keys()) > 2: raise InvalidRule('too many statements') key = list(d.keys())[0] @@ -330,10 +330,10 @@ def build_statements(d, scope): # # count(offset(0xC)) # count(number(0x11223344)) - # count(number(0x100 = symbol name)) + # count(number(0x100 = description)) if term in ('number', 'offset', 'bytes'): - value, symbol = parse_symbol(arg, term) - feature = Feature(value, symbol) + value, description = parse_description(arg, term) + feature = Feature(value, description) else: # arg is string, like: # @@ -370,13 +370,8 @@ def build_statements(d, scope): raise InvalidRule('invalid regular expression: %s it should use Python syntax, try it at https://pythex.org' % d[key]) else: Feature = parse_feature(key) - if key in ('number', 'offset', 'bytes'): - # parse numbers with symbol description, e.g. 0x4550 = IMAGE_DOS_SIGNATURE - # or regular numbers, e.g. 
37 - value, symbol = parse_symbol(d[key], key) - feature = Feature(value, symbol) - else: - feature = Feature(d[key]) + value, symbol = parse_description(d[key], key, d.get('description')) + feature = Feature(value, symbol) ensure_feature_valid_for_scope(scope, feature) return feature From 681feebf356d9d80bfb970c2136cb846eb6c1b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Tue, 30 Jun 2020 10:57:58 +0200 Subject: [PATCH 14/18] Adapt description implementation to new output As the `__str__` method is not used anymore in the output, the description implementation needs to be adapted. --- README.md | 58 +++++++++++++++++++++------------------ capa/features/__init__.py | 33 ++++++++++------------ capa/features/insn.py | 8 +++--- capa/render/__init__.py | 20 +++----------- capa/render/vverbose.py | 9 ++++-- capa/rules.py | 10 +++---- 6 files changed, 66 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 318b6ba6..7f8d3845 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ Download capa from the [Releases](/releases) page or get the nightly builds here - [section](#section) - [counting](#counting) - [matching prior rule matches](#matching-prior-rule-matches) + - [descriptions](#descriptions) - [limitations](#Limitations) # installation @@ -317,25 +318,6 @@ These are the features supported at the function-scope: - [mnemonic](#mnemonic) - [characteristics](#characteristics) -All of them support an optional description which helps with documenting rules and provides context in capa's output. -It can be specified in the following way: - -``` -- string: This program cannot be run in DOS mode. - description: MS-DOS stub message -- number: 0x4550 - description: IMAGE_DOS_SIGNATURE (MZ) -``` - -For all features except for [string](#string), the description can be specified inline preceded by ` = `. 
-For the previous [number](#number) example: - -``` -- number: 0x4550 = IMAGE_DOS_SIGNATURE (MZ) -``` - -The inline syntax is preferred (except for [string](#string)). - ### api A call to a named function, probably an import, though possibly a local function (like `malloc`) extracted via FLIRT. @@ -358,8 +340,9 @@ For example, a crypto constant. The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format. -It can include an optional description, e.g. for constant definitions. -The inline syntax is preferred (` = DESCRIPTION STRING`). +To help humans understand the meaning of a number, such that the constant `0x40` means `PAGE_EXECUTE_READWRITE`, you may provide a description alongside the definition. +Use the inline syntax (preferred) by ending the line with ` = DESCRIPTION STRING`. +Check the [description section](#description) for more details. Examples: @@ -381,7 +364,9 @@ Regexes should be surrounded with `/` characters. By default, capa uses case-sensitive matching and assumes leading and trailing wildcards. To perform case-insensitive matching append an `i`. To anchor the regex at the start or end of a string, use `^` and/or `$`. -Strings can include a description, but the inline syntax is not supported. +To add context to a string use the two-line syntax, using the `description` tag: `description: DESCRIPTION STRING`. +The inline syntax is not supported. +Check the [description section](#description) for more details. Examples: @@ -401,9 +386,9 @@ Note that regex matching is expensive (`O(features)` rather than `O(1)`) so they A sequence of bytes referenced by the logic of the program. The provided sequence must match from the beginning of the referenced bytes and be no more than `0x100` bytes. The parameter is a sequence of hexadecimal bytes. -It can include an optional description. -The inline syntax is preferred (` = DESCRIPTION STRING`). 
- +To help humans understand the meaning of the bytes sequence, you may provide a description. +Use the inline syntax (preferred) by ending the line with ` = DESCRIPTION STRING`. +Check the [description section](#description) for more details. The example below illustrates byte matching given a COM CLSID pushed onto the stack prior to `CoCreateInstance`. @@ -482,7 +467,6 @@ These are the features supported at the file-scope: - [import](#import) - [section](#section) -All of them can be followed by an optional description, as the features in the previous section. ### file string An ASCII or UTF-16 LE string present in the file. @@ -563,6 +547,28 @@ By default, library rules will not be output to the user as a rule match, but can be matched by other rules. When no active rules depend on a library rule, these the library rules will not be evaluated - maintaining performance. +## description + +All features support an optional description which helps with documenting rules and provides context in capa's output. +For all features except for [strings](#string), the description can be specified inline preceded by ` = `: ` = DESCRIPTION STRING`. +For example: + +``` +- number: 0x4550 = IMAGE_DOS_SIGNATURE (MZ) +``` + +The inline syntax is preferred. +For [strings](#string) or if the description is long or contains newlines, use the two-line syntax. +It uses the `description` tag in the following way: `description: DESCRIPTION STRING` +For example: + +``` +- string: This program cannot be run in DOS mode. + description: MS-DOS stub message +- number: 0x4550 + description: IMAGE_DOS_SIGNATURE (MZ) +``` + # limitations To learn more about capa's current limitations see [here](doc/limitations.md). 
diff --git a/capa/features/__init__.py b/capa/features/__init__.py index 182cd514..ee407291 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -19,7 +19,7 @@ def bytes_to_str(b): class Feature(object): def __init__(self, args, description=None): super(Feature, self).__init__() - self.name = self.__class__.__name__ + self.name = self.__class__.__name__.lower() self.args = args self.description = description @@ -29,17 +29,16 @@ class Feature(object): def __eq__(self, other): return self.name == other.name and self.args == other.args - def _str_name(self): - return self.name.lower() - - def _str_value(self): + # Used to overwrite the rendering of the feature args in `__str__` and the + # json output + def get_args_str(self): return ','.join(self.args) def __str__(self): if self.description: - return '%s(%s = %s)' % (self._str_name(), self._str_value(), self.description) + return '%s(%s = %s)' % (self.name, self.get_args_str(), self.description) else: - return '%s(%s)' % (self._str_name(), self._str_value()) + return '%s(%s)' % (self.name, self.get_args_str()) def __repr__(self): return str(self) @@ -62,21 +61,19 @@ class Feature(object): class MatchedRule(Feature): def __init__(self, rule_name, description=None): super(MatchedRule, self).__init__([rule_name], description) + self.name = 'match' self.rule_name = rule_name - def _str_name(self): - return 'match' - class Characteristic(Feature): - def __init__(self, name, value=None, description=None): + def __init__(self, attribute, value=None, description=None): ''' when `value` is not provided, this serves as descriptor for a class of characteristics. this is only used internally, such as in `rules.py` when checking if a statement is supported by a given scope. 
''' - super(Characteristic, self).__init__([name, value], description) - self.name = name + super(Characteristic, self).__init__([attribute, value], description) + self.attribute = attribute self.value = value def evaluate(self, ctx): @@ -84,11 +81,11 @@ class Characteristic(Feature): raise ValueError('cannot evaluate characteristc %s with empty value' % (str(self))) return super(Characteristic, self).evaluate(ctx) - def _str_value(self): + def get_args_str(self): if self.value is None: - return self.name + return self.attribute else: - return '%s(%s)' % (self.name, self.value) + return '%s(%s)' % (self.attribute, self.value) class String(Feature): @@ -112,8 +109,8 @@ class Bytes(Feature): return capa.engine.Result(False, self, []) - def _str_value(self): - return '0x%s' % bytes_to_str(self.value).upper() + def get_args_str(self): + return bytes_to_str(self.value).upper() def freeze_serialize(self): return (self.__class__.__name__, diff --git a/capa/features/insn.py b/capa/features/insn.py index 122bd0ae..a353cb43 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -16,8 +16,8 @@ class Number(Feature): super(Number, self).__init__([value], description) self.value = value - def _str_value(self): - return '0x%x' % self.value + def get_args_str(self): + return '0x%X' % self.value class Offset(Feature): @@ -25,8 +25,8 @@ class Offset(Feature): super(Offset, self).__init__([value]) self.value = value - def _str_value(self): - return '0x%x' % self.value + def get_args_str(self): + return '0x%X' % self.value class Mnemonic(Feature): diff --git a/capa/render/__init__.py b/capa/render/__init__.py index e53aca1f..a1808f6e 100644 --- a/capa/render/__init__.py +++ b/capa/render/__init__.py @@ -86,23 +86,11 @@ def convert_feature_to_result_document(feature): "type": "characteristic" }, """ - name, value = feature.freeze_serialize() + result = {'type': feature.name, feature.name: feature.get_args_str()} + if feature.description: + result['description'] = 
feature.description - # make the terms pretty - name = name.lower() - if name == 'matchedrule': - name = 'match' - - # in the common case, there's a single argument - # so use it directly. - # like: name=number value=1 - if isinstance(value, list) and len(value) == 1: - value = value[0] - - return { - 'type': name, - name: value, - } + return result def convert_node_to_result_document(node): diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 46dd09f8..0f0adc2a 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -48,7 +48,7 @@ def render_statement(ostream, match, statement, indent=0): elif child['type'] == 'bytes': feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex_string(child[child['type']]))) elif child['type'] == 'characteristic': - feature = 'characteristic(%s)' % (rutils.bold2(child['characteristic'][0])) + feature = 'characteristic(%s)' % (rutils.bold2(child['characteristic'])) else: raise RuntimeError('unexpected feature type: ' + str(child)) @@ -94,13 +94,16 @@ def render_feature(ostream, match, feature, indent=0): # it should always be an even number of characters (its hex). 
ostream.write(rutils.bold2(rutils.hex_string(feature[feature['type']]))) elif feature['type'] == 'characteristic': - ostream.write('characteristic(%s)' % (rutils.bold2(feature['characteristic'][0]))) + ostream.write('characteristic(%s)' % (rutils.bold2(feature['characteristic']))) # note that regex is found in `render_statement` else: raise RuntimeError('unexpected feature type: ' + str(feature)) - render_locations(ostream, match) + if 'description' in feature: + ostream.write(' = ') + ostream.write(feature['description']) + render_locations(ostream, match) ostream.write('\n') diff --git a/capa/rules.py b/capa/rules.py index 7e2ffd6a..155f7127 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -138,7 +138,7 @@ class InvalidRuleSet(ValueError): def ensure_feature_valid_for_scope(scope, feature): if isinstance(feature, capa.features.Characteristic): - if capa.features.Characteristic(feature.name) not in SUPPORTED_FEATURES[scope]: + if capa.features.Characteristic(feature.attribute) not in SUPPORTED_FEATURES[scope]: raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) elif not isinstance(feature, tuple(filter(lambda t: isinstance(t, type), SUPPORTED_FEATURES[scope]))): raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) @@ -226,7 +226,7 @@ def parse_description(s, value_type, description=None): ''' if value_type != 'string' and isinstance(s, str) and ' = ' in s: if description: - raise InvalidRule('unexpected value: "%s", only one description allowed (inline description with `=`)' % s) + raise InvalidRule('unexpected value: "%s", only one description allowed (inline description with ` = `)' % s) value, description = s.split(' = ', 1) if description == '': raise InvalidRule('unexpected value: "%s", description cannot be empty' % s) @@ -244,7 +244,7 @@ def parse_description(s, value_type, description=None): if len(value) > MAX_BYTES_FEATURE_SIZE: raise InvalidRule('unexpected bytes value: byte sequences must be no larger 
than %s bytes' % MAX_BYTES_FEATURE_SIZE) - elif value_type in ['number', 'offset']: + elif value_type in {'number', 'offset'}: try: value = parse_int(value) except ValueError: @@ -370,8 +370,8 @@ def build_statements(d, scope): raise InvalidRule('invalid regular expression: %s it should use Python syntax, try it at https://pythex.org' % d[key]) else: Feature = parse_feature(key) - value, symbol = parse_description(d[key], key, d.get('description')) - feature = Feature(value, symbol) + value, description = parse_description(d[key], key, d.get('description')) + feature = Feature(value, description) ensure_feature_valid_for_scope(scope, feature) return feature From 64124c0b64563bcfa35084982388e7ed75c02cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Wed, 1 Jul 2020 19:40:50 +0200 Subject: [PATCH 15/18] Remove True from Characteristic rules and output Get rid of `True` in characteristic (rules, output and json) as it is implicit. This way, the same syntax is used for characteristic as for the rest of the features. 
Co-authored-by: William Ballenthin --- capa/features/__init__.py | 30 ++++----- capa/features/extractors/__init__.py | 8 +-- capa/features/extractors/ida/basicblock.py | 4 +- capa/features/extractors/ida/file.py | 2 +- capa/features/extractors/ida/function.py | 8 +-- capa/features/extractors/ida/insn.py | 14 ++--- capa/features/extractors/viv/basicblock.py | 4 +- capa/features/extractors/viv/file.py | 2 +- capa/features/extractors/viv/function.py | 6 +- capa/features/extractors/viv/insn.py | 26 ++++---- capa/render/vverbose.py | 8 +-- capa/rules.py | 73 +++++++++------------- rules | 2 +- tests/test_freeze.py | 14 ++--- tests/test_main.py | 10 +-- tests/test_rules.py | 16 ++--- tests/test_viv_features.py | 46 +++++++------- 17 files changed, 125 insertions(+), 148 deletions(-) diff --git a/capa/features/__init__.py b/capa/features/__init__.py index ee407291..28931ad9 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -66,26 +66,22 @@ class MatchedRule(Feature): class Characteristic(Feature): - def __init__(self, attribute, value=None, description=None): - ''' - when `value` is not provided, this serves as descriptor for a class of characteristics. - this is only used internally, such as in `rules.py` when checking if a statement is - supported by a given scope. - ''' - super(Characteristic, self).__init__([attribute, value], description) - self.attribute = attribute + def __init__(self, value, description=None): + super(Characteristic, self).__init__([value], description) self.value = value - def evaluate(self, ctx): - if self.value is None: - raise ValueError('cannot evaluate characteristc %s with empty value' % (str(self))) - return super(Characteristic, self).evaluate(ctx) + def freeze_serialize(self): + # in an older version of capa, characteristics could theoretically match non-existence (value=False). + # but we found this was never used (and better expressed with `not: characteristic: ...`). 
+ # this was represented using an additional parameter for Characteristic. + # its been removed, but we keep it around in the freeze format to maintain backwards compatibility. + # this value is ignored, however. + return (self.__class__.__name__, [self.value, True]) - def get_args_str(self): - if self.value is None: - return self.attribute - else: - return '%s(%s)' % (self.attribute, self.value) + @classmethod + def freeze_deserialize(cls, args): + # see above. we ignore the second element in the 2-tuple here. + return cls(args[0]) class String(Feature): diff --git a/capa/features/extractors/__init__.py b/capa/features/extractors/__init__.py index 0486a63a..f8cfa941 100644 --- a/capa/features/extractors/__init__.py +++ b/capa/features/extractors/__init__.py @@ -184,22 +184,22 @@ class NullFeatureExtractor(FeatureExtractor): extractor = NullFeatureExtractor({ 'file features': [ - (0x402345, capa.features.Characteristic('embedded pe', True)), + (0x402345, capa.features.Characteristic('embedded pe')), ], 'functions': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('switch', True)), + (0x401000, capa.features.Characteristic('switch')), ], 'basic blocks': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('tight-loop', True)), + (0x401000, capa.features.Characteristic('tight-loop')), ], 'instructions': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('nzxor', True)), + (0x401000, capa.features.Characteristic('nzxor')), ], }, 0x401002: ... 
diff --git a/capa/features/extractors/ida/basicblock.py b/capa/features/extractors/ida/basicblock.py index 51ba648a..e4756390 100644 --- a/capa/features/extractors/ida/basicblock.py +++ b/capa/features/extractors/ida/basicblock.py @@ -103,7 +103,7 @@ def extract_bb_stackstring(f, bb): bb (IDA BasicBlock) ''' if _ida_bb_contains_stackstring(f, bb): - yield Characteristic('stack string', True), bb.start_ea + yield Characteristic('stack string'), bb.start_ea def _ida_bb_contains_tight_loop(f, bb): @@ -133,7 +133,7 @@ def extract_bb_tight_loop(f, bb): bb (IDA BasicBlock) ''' if _ida_bb_contains_tight_loop(f, bb): - yield Characteristic('tight loop', True), bb.start_ea + yield Characteristic('tight loop'), bb.start_ea def extract_features(f, bb): diff --git a/capa/features/extractors/ida/file.py b/capa/features/extractors/ida/file.py index f75bf148..4b00a84a 100644 --- a/capa/features/extractors/ida/file.py +++ b/capa/features/extractors/ida/file.py @@ -68,7 +68,7 @@ def extract_file_embedded_pe(): continue for ea, _ in _ida_check_segment_for_pe(seg): - yield Characteristic('embedded pe', True), ea + yield Characteristic('embedded pe'), ea def extract_file_export_names(): diff --git a/capa/features/extractors/ida/function.py b/capa/features/extractors/ida/function.py index 0712ec8b..c6f55d36 100644 --- a/capa/features/extractors/ida/function.py +++ b/capa/features/extractors/ida/function.py @@ -29,7 +29,7 @@ def extract_function_switch(f): f (IDA func_t) ''' if _ida_function_contains_switch(f): - yield Characteristic('switch', True), f.start_ea + yield Characteristic('switch'), f.start_ea def extract_function_calls_to(f): @@ -39,7 +39,7 @@ def extract_function_calls_to(f): f (IDA func_t) ''' for ea in idautils.CodeRefsTo(f.start_ea, True): - yield Characteristic('calls to', True), ea + yield Characteristic('calls to'), ea def extract_function_loop(f): @@ -53,7 +53,7 @@ def extract_function_loop(f): map(lambda s: edges.append((bb.start_ea, s.start_ea)), bb.succs()) if 
edges and loops.has_loop(edges): - yield Characteristic('loop', True), f.start_ea + yield Characteristic('loop'), f.start_ea def extract_recursive_call(f): @@ -64,7 +64,7 @@ def extract_recursive_call(f): ''' for ref in idautils.CodeRefsTo(f.start_ea, True): if f.contains(ref): - yield Characteristic('recursive call', True), f.start_ea + yield Characteristic('recursive call'), f.start_ea break diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index 3526d67d..59c7eeb3 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -259,7 +259,7 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn): if _is_nzxor_stack_cookie(f, bb, insn): return - yield Characteristic('nzxor', True), insn.ea + yield Characteristic('nzxor'), insn.ea def extract_insn_mnemonic_features(f, bb, insn): @@ -292,7 +292,7 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn): if ' fs:30h' in disasm or ' gs:60h' in disasm: # TODO: replace above with proper IDA - yield Characteristic('peb access', True), insn.ea + yield Characteristic('peb access'), insn.ea def extract_insn_segment_access_features(f, bb, insn): @@ -309,11 +309,11 @@ def extract_insn_segment_access_features(f, bb, insn): if ' fs:' in disasm: # TODO: replace above with proper IDA - yield Characteristic('fs access', True), insn.ea + yield Characteristic('fs access'), insn.ea if ' gs:' in disasm: # TODO: replace above with proper IDA - yield Characteristic('gs access', True), insn.ea + yield Characteristic('gs access'), insn.ea def extract_insn_cross_section_cflow(f, bb, insn): @@ -336,7 +336,7 @@ def extract_insn_cross_section_cflow(f, bb, insn): if idaapi.getseg(ref) == idaapi.getseg(insn.ea): continue - yield Characteristic('cross section flow', True), insn.ea + yield Characteristic('cross section flow'), insn.ea def extract_function_calls_from(f, bb, insn): @@ -354,7 +354,7 @@ def extract_function_calls_from(f, bb, insn): return for 
ref in idautils.CodeRefsFrom(insn.ea, False): - yield Characteristic('calls from', True), ref + yield Characteristic('calls from'), ref def extract_function_indirect_call_characteristic_features(f, bb, insn): @@ -373,7 +373,7 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn): return if idc.get_operand_type(insn.ea, 0) in (idc.o_reg, idc.o_phrase, idc.o_displ): - yield Characteristic('indirect call', True), insn.ea + yield Characteristic('indirect call'), insn.ea def extract_features(f, bb, insn): diff --git a/capa/features/extractors/viv/basicblock.py b/capa/features/extractors/viv/basicblock.py index a7a6ef5c..ad1be20e 100644 --- a/capa/features/extractors/viv/basicblock.py +++ b/capa/features/extractors/viv/basicblock.py @@ -39,7 +39,7 @@ def _bb_has_tight_loop(f, bb): def extract_bb_tight_loop(f, bb): ''' check basic block for tight loop indicators ''' if _bb_has_tight_loop(f, bb): - yield Characteristic('tight loop', True), bb.va + yield Characteristic('tight loop'), bb.va def _bb_has_stackstring(f, bb): @@ -62,7 +62,7 @@ def _bb_has_stackstring(f, bb): def extract_stackstring(f, bb): ''' check basic block for stackstring indicators ''' if _bb_has_stackstring(f, bb): - yield Characteristic('stack string', True), bb.va + yield Characteristic('stack string'), bb.va def is_mov_imm_to_stack(instr): diff --git a/capa/features/extractors/viv/file.py b/capa/features/extractors/viv/file.py index 78678c77..4f35c601 100644 --- a/capa/features/extractors/viv/file.py +++ b/capa/features/extractors/viv/file.py @@ -13,7 +13,7 @@ def extract_file_embedded_pe(vw, file_path): fbytes = f.read() for offset, i in pe_carve.carve(fbytes, 1): - yield Characteristic('embedded pe', True), offset + yield Characteristic('embedded pe'), offset def extract_file_export_names(vw, file_path): diff --git a/capa/features/extractors/viv/function.py b/capa/features/extractors/viv/function.py index 43271c13..8efffac3 100644 --- a/capa/features/extractors/viv/function.py 
+++ b/capa/features/extractors/viv/function.py @@ -53,12 +53,12 @@ def extract_function_switch(f): method can be optimized ''' if f.va in get_functions_with_switch(f.vw): - yield Characteristic('switch', True), f.va + yield Characteristic('switch'), f.va def extract_function_calls_to(f): for src, _, _, _ in f.vw.getXrefsTo(f.va, rtype=vivisect.const.REF_CODE): - yield Characteristic('calls to', True), src + yield Characteristic('calls to'), src def extract_function_loop(f): @@ -74,7 +74,7 @@ def extract_function_loop(f): edges.append((bb.va, bva)) if edges and loops.has_loop(edges): - yield Characteristic('loop', True), f.va + yield Characteristic('loop'), f.va def extract_features(f): diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index 08cc431a..1b4c68c2 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -286,7 +286,7 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn): if is_security_cookie(f, bb, insn): return - yield Characteristic('nzxor', True), insn.va + yield Characteristic('nzxor'), insn.va def extract_insn_mnemonic_features(f, bb, insn): @@ -314,12 +314,12 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn): # fs: push dword [eax + 0x30] ; i386RegMemOper, with eax = 0 if (isinstance(oper, envi.archs.i386.disasm.i386RegMemOper) and oper.disp == 0x30) or \ (isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper) and oper.imm == 0x30): - yield Characteristic('peb access', True), insn.va + yield Characteristic('peb access'), insn.va elif 'gs' in insn.getPrefixName(): for oper in insn.opers: if (isinstance(oper, envi.archs.amd64.disasm.i386RegMemOper) and oper.disp == 0x60) or \ (isinstance(oper, envi.archs.amd64.disasm.i386ImmMemOper) and oper.imm == 0x60): - yield Characteristic('peb access', True), insn.va + yield Characteristic('peb access'), insn.va else: pass @@ -329,10 +329,10 @@ def extract_insn_segment_access_features(f, bb, insn): prefix 
= insn.getPrefixName() if prefix == 'fs': - yield Characteristic('fs access', True), insn.va + yield Characteristic('fs access'), insn.va if prefix == 'gs': - yield Characteristic('gs access', True), insn.va + yield Characteristic('gs access'), insn.va def get_section(vw, va): @@ -369,7 +369,7 @@ def extract_insn_cross_section_cflow(f, bb, insn): continue if get_section(f.vw, insn.va) != get_section(f.vw, va): - yield Characteristic('cross section flow', True), insn.va + yield Characteristic('cross section flow'), insn.va except KeyError: continue @@ -387,7 +387,7 @@ def extract_function_calls_from(f, bb, insn): if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper): oper = insn.opers[0] target = oper.getOperAddr(insn) - yield Characteristic('calls from', True), target + yield Characteristic('calls from'), target # call via thunk on x86, # see 9324d1a8ae37a36ae560c37448c9705a at 0x407985 @@ -396,18 +396,18 @@ def extract_function_calls_from(f, bb, insn): # see Lab21-01.exe_:0x140001178 elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386PcRelOper): target = insn.opers[0].getOperValue(insn) - yield Characteristic('calls from', True), target + yield Characteristic('calls from'), target # call via IAT, x64 elif isinstance(insn.opers[0], envi.archs.amd64.disasm.Amd64RipRelOper): op = insn.opers[0] target = op.getOperAddr(insn) - yield Characteristic('calls from', True), target + yield Characteristic('calls from'), target if target and target == f.va: # if we found a jump target and it's the function address # mark as recursive - yield Characteristic('recursive call', True), target + yield Characteristic('recursive call'), target # this is a feature that's most relevant at the function or basic block scope, @@ -423,13 +423,13 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn): # Checks below work for x86 and x64 if isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper): # call edx - yield Characteristic('indirect call', 
True), insn.va + yield Characteristic('indirect call'), insn.va elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegMemOper): # call dword ptr [eax+50h] - yield Characteristic('indirect call', True), insn.va + yield Characteristic('indirect call'), insn.va elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386SibOper): # call qword ptr [rsp+78h] - yield Characteristic('indirect call', True), insn.va + yield Characteristic('indirect call'), insn.va def extract_features(f, bb, insn): diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 0f0adc2a..9f6d700a 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -41,14 +41,12 @@ def render_statement(ostream, match, statement, indent=0): # so, we have to inline some of the feature rendering here. child = statement['child'] - if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match'): + if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match', 'characteristic'): feature = '%s(%s)' % (child['type'], rutils.bold2(child[child['type']])) elif child['type'] in ('number', 'offset'): feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex(child[child['type']]))) elif child['type'] == 'bytes': feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex_string(child[child['type']]))) - elif child['type'] == 'characteristic': - feature = 'characteristic(%s)' % (rutils.bold2(child['characteristic'])) else: raise RuntimeError('unexpected feature type: ' + str(child)) @@ -80,7 +78,7 @@ def render_statement(ostream, match, statement, indent=0): def render_feature(ostream, match, feature, indent=0): ostream.write(' ' * indent) - if feature['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match'): + if feature['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match', 'characteristic'): ostream.write(feature['type']) 
ostream.write(': ') ostream.write(rutils.bold2(feature[feature['type']])) @@ -93,8 +91,6 @@ def render_feature(ostream, match, feature, indent=0): # bytes is the uppercase, hex-encoded string. # it should always be an even number of characters (its hex). ostream.write(rutils.bold2(rutils.hex_string(feature[feature['type']]))) - elif feature['type'] == 'characteristic': - ostream.write('characteristic(%s)' % (rutils.bold2(feature['characteristic']))) # note that regex is found in `render_statement` else: raise RuntimeError('unexpected feature type: ' + str(feature)) diff --git a/capa/rules.py b/capa/rules.py index 155f7127..8c0f995c 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -138,7 +138,7 @@ class InvalidRuleSet(ValueError): def ensure_feature_valid_for_scope(scope, feature): if isinstance(feature, capa.features.Characteristic): - if capa.features.Characteristic(feature.attribute) not in SUPPORTED_FEATURES[scope]: + if capa.features.Characteristic(feature.value) not in SUPPORTED_FEATURES[scope]: raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) elif not isinstance(feature, tuple(filter(lambda t: isinstance(t, type), SUPPORTED_FEATURES[scope]))): raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) @@ -205,9 +205,8 @@ def parse_feature(key): return capa.features.insn.Mnemonic elif key == 'basic blocks': return capa.features.basicblock.BasicBlock - elif key.startswith('characteristic(') and key.endswith(')'): - characteristic = key[len('characteristic('):-len(')')] - return lambda v, description=None: capa.features.Characteristic(characteristic, v, description) + elif key == 'characteristic': + return capa.features.Characteristic elif key == 'export': return capa.features.file.Export elif key == 'import': @@ -302,48 +301,34 @@ def build_statements(d, scope): term = key[len('count('):-len(')')] - if term.startswith('characteristic('): - # characteristic features are specified a bit specially: - # they simply 
indicate the presence of something unusual/interesting, - # and we embed the name in the feature name, like `characteristic(nzxor)`. - # - # when we're dealing with counts, like `count(characteristic(nzxor))`, - # we can simply extract the feature and assume we're looking for `True` values. - Feature = parse_feature(term) - feature = Feature(True) - ensure_feature_valid_for_scope(scope, feature) - else: - # however, for remaining counted features, like `count(mnemonic(mov))`, - # we have to jump through hoops. - # - # when looking for the existance of such a feature, our rule might look like: - # - mnemonic: mov - # - # but here we deal with the form: `mnemonic(mov)`. - term, _, arg = term.partition('(') - Feature = parse_feature(term) + # when looking for the existence of such a feature, our rule might look like: + # - mnemonic: mov + # + # but here we deal with the form: `mnemonic(mov)`. + term, _, arg = term.partition('(') + Feature = parse_feature(term) - if arg: - arg = arg[:-len(')')] - # can't rely on yaml parsing ints embedded within strings - # like: - # - # count(offset(0xC)) - # count(number(0x11223344)) - # count(number(0x100 = description)) - if term in ('number', 'offset', 'bytes'): - value, description = parse_description(arg, term) - feature = Feature(value, description) - else: - # arg is string, like: - # - # count(mnemonic(mov)) - # count(string(error)) - # TODO: what about embedded newlines? - feature = Feature(arg) + if arg: + arg = arg[:-len(')')] + # can't rely on yaml parsing ints embedded within strings + # like: + # + # count(offset(0xC)) + # count(number(0x11223344)) + # count(number(0x100 = description)) + if term in ('number', 'offset', 'bytes'): + value, description = parse_description(arg, term) + feature = Feature(value, description) else: - feature = Feature() - ensure_feature_valid_for_scope(scope, feature) + # arg is string, like: + # + # count(mnemonic(mov)) + # count(string(error)) + # TODO: what about embedded newlines? 
+ feature = Feature(arg) + else: + feature = Feature() + ensure_feature_valid_for_scope(scope, feature) count = d[key] if isinstance(count, int): diff --git a/rules b/rules index e5db2268..da61c913 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit e5db22684432c7fb951cd2bf4cde921f90e62f68 +Subproject commit da61c9138efee257bddfb5f68d1905578e11e23a diff --git a/tests/test_freeze.py b/tests/test_freeze.py index 29c05fde..ee51ba5d 100644 --- a/tests/test_freeze.py +++ b/tests/test_freeze.py @@ -12,23 +12,23 @@ from fixtures import * EXTRACTOR = capa.features.extractors.NullFeatureExtractor({ 'file features': [ - (0x402345, capa.features.Characteristic('embedded pe', True)), + (0x402345, capa.features.Characteristic('embedded pe')), ], 'functions': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('switch', True)), + (0x401000, capa.features.Characteristic('switch')), ], 'basic blocks': { 0x401000: { 'features': [ - (0x401000, capa.features.Characteristic('tight loop', True)), + (0x401000, capa.features.Characteristic('tight loop')), ], 'instructions': { 0x401000: { 'features': [ (0x401000, capa.features.insn.Mnemonic('xor')), - (0x401000, capa.features.Characteristic('nzxor', True)), + (0x401000, capa.features.Characteristic('nzxor')), ], }, 0x401002: { @@ -57,9 +57,9 @@ def test_null_feature_extractor(): scope: basic block features: - and: - - characteristic(tight loop): true + - characteristic: tight loop - mnemonic: xor - - characteristic(nzxor): true + - characteristic: nzxor ''')), ]) capabilities = capa.main.find_capabilities(rules, EXTRACTOR) @@ -150,7 +150,7 @@ def test_serialize_features(): roundtrip_feature(capa.features.insn.Offset(0x0)) roundtrip_feature(capa.features.insn.Mnemonic('push')) roundtrip_feature(capa.features.file.Section('.rsrc')) - roundtrip_feature(capa.features.Characteristic('tight loop', True)) + roundtrip_feature(capa.features.Characteristic('tight loop')) 
roundtrip_feature(capa.features.basicblock.BasicBlock()) roundtrip_feature(capa.features.file.Export('BaseThreadInitThunk')) roundtrip_feature(capa.features.file.Import('kernel32.IsWow64Process')) diff --git a/tests/test_main.py b/tests/test_main.py index 5c5640bc..da1834f6 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -42,7 +42,7 @@ def test_ruleset(): name: file rule scope: file features: - - characteristic(embedded pe): y + - characteristic: embedded pe ''')), capa.rules.Rule.from_yaml(textwrap.dedent(''' rule: @@ -50,7 +50,7 @@ def test_ruleset(): name: function rule scope: function features: - - characteristic(switch): y + - characteristic: switch ''')), capa.rules.Rule.from_yaml(textwrap.dedent(''' rule: @@ -58,7 +58,7 @@ def test_ruleset(): name: basic block rule scope: basic block features: - - characteristic(nzxor): y + - characteristic: nzxor ''')), ]) @@ -128,7 +128,7 @@ def test_match_across_scopes(sample_9324d1a8ae37a36ae560c37448c9705a): examples: - 9324d1a8ae37a36ae560c37448c9705a:0x403685 features: - - characteristic(tight loop): true + - characteristic: tight loop ''')), # this rule should match on a function (0x403660) # based on API, as well as prior basic block rule match @@ -176,7 +176,7 @@ def test_subscope_bb_rules(sample_9324d1a8ae37a36ae560c37448c9705a): features: - and: - basic block: - - characteristic(tight loop): true + - characteristic: tight loop ''')) ]) # tight loop at 0x403685 diff --git a/tests/test_rules.py b/tests/test_rules.py index 98bd2e79..18b1746f 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -118,7 +118,7 @@ def test_invalid_rule_feature(): name: test rule scope: file features: - - characteristic(nzxor): true + - characteristic: nzxor ''')) with pytest.raises(capa.rules.InvalidRule): @@ -128,7 +128,7 @@ def test_invalid_rule_feature(): name: test rule scope: function features: - - characteristic(embedded pe): true + - characteristic: embedded pe ''')) with 
pytest.raises(capa.rules.InvalidRule): @@ -138,7 +138,7 @@ def test_invalid_rule_feature(): name: test rule scope: basic block features: - - characteristic(embedded pe): true + - characteristic: embedded pe ''')) @@ -173,11 +173,11 @@ def test_subscope_rules(): scope: file features: - and: - - characteristic(embedded pe): true + - characteristic: embedded pe - function: - and: - - characteristic(nzxor): true - - characteristic(switch): true + - characteristic: nzxor + - characteristic: switch ''')) ]) # the file rule scope will have one rules: @@ -229,7 +229,7 @@ def test_invalid_rules(): meta: name: test rule features: - - characteristic(number(1)): True + - characteristic: number(1) ''')) with pytest.raises(capa.rules.InvalidRule): @@ -238,7 +238,7 @@ def test_invalid_rules(): meta: name: test rule features: - - characteristic(count(number(100))): True + - characteristic: count(number(100)) ''')) diff --git a/tests/test_viv_features.py b/tests/test_viv_features.py index ac0bac9d..5f68003a 100644 --- a/tests/test_viv_features.py +++ b/tests/test_viv_features.py @@ -116,7 +116,7 @@ def test_offset_features(mimikatz): def test_nzxor_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x410DFC)) - assert capa.features.Characteristic('nzxor', True) in features # 0x0410F0B + assert capa.features.Characteristic('nzxor') in features # 0x0410F0B def get_bb_insn(f, va): @@ -154,7 +154,7 @@ def test_mnemonic_features(mimikatz): def test_peb_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA6FEC)) - assert capa.features.Characteristic('peb access', True) in features + assert capa.features.Characteristic('peb access') in features def test_file_section_name_features(mimikatz): @@ -170,7 +170,7 @@ def test_tight_loop_features(mimikatz): if bb.va != 0x402F8E: continue features = extract_basic_block_features(f, bb) - assert 
capa.features.Characteristic('tight loop', True) in features + assert capa.features.Characteristic('tight loop') in features assert capa.features.basicblock.BasicBlock() in features @@ -180,7 +180,7 @@ def test_tight_loop_bb_features(mimikatz): if bb.va != 0x402F8E: continue features = extract_basic_block_features(f, bb) - assert capa.features.Characteristic('tight loop', True) in features + assert capa.features.Characteristic('tight loop') in features assert capa.features.basicblock.BasicBlock() in features @@ -202,17 +202,17 @@ def test_file_import_name_features(mimikatz): def test_cross_section_flow_features(sample_a198216798ca38f280dc413f8c57f2c2): features = extract_function_features(viv_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.vw, 0x4014D0)) - assert capa.features.Characteristic('cross section flow', True) in features + assert capa.features.Characteristic('cross section flow') in features # this function has calls to some imports, # which should not trigger cross-section flow characteristic features = extract_function_features(viv_utils.Function(sample_a198216798ca38f280dc413f8c57f2c2.vw, 0x401563)) - assert capa.features.Characteristic('cross section flow', True) not in features + assert capa.features.Characteristic('cross section flow') not in features def test_segment_access_features(sample_a933a1a402775cfa94b6bee0963f4b46): features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA6FEC)) - assert capa.features.Characteristic('fs access', True) in features + assert capa.features.Characteristic('fs access') in features def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a): @@ -223,36 +223,36 @@ def test_thunk_features(sample_9324d1a8ae37a36ae560c37448c9705a): def test_file_embedded_pe(pma_lab_12_04): features = extract_file_features(pma_lab_12_04.vw, pma_lab_12_04.path) - assert capa.features.Characteristic('embedded pe', True) in features + assert capa.features.Characteristic('embedded 
pe') in features def test_stackstring_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x4556E5)) - assert capa.features.Characteristic('stack string', True) in features + assert capa.features.Characteristic('stack string') in features def test_switch_features(mimikatz): features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x409411)) - assert capa.features.Characteristic('switch', True) in features + assert capa.features.Characteristic('switch') in features features = extract_function_features(viv_utils.Function(mimikatz.vw, 0x409393)) - assert capa.features.Characteristic('switch', True) not in features + assert capa.features.Characteristic('switch') not in features def test_recursive_call_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10003100)) - assert capa.features.Characteristic('recursive call', True) in features + assert capa.features.Characteristic('recursive call') in features features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10007B00)) - assert capa.features.Characteristic('recursive call', True) not in features + assert capa.features.Characteristic('recursive call') not in features def test_loop_feature(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41): features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10003D30)) - assert capa.features.Characteristic('loop', True) in features + assert capa.features.Characteristic('loop') in features features = extract_function_features(viv_utils.Function(sample_39c05b15e9834ac93f206bc114d0a00c357c888db567ba8f5345da0529cbed41.vw, 0x10007250)) - assert capa.features.Characteristic('loop', True) not in features + 
assert capa.features.Characteristic('loop') not in features def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5): @@ -263,20 +263,20 @@ def test_file_string_features(sample_bfb9b5391a13d0afd787e87ab90f14f5): def test_function_calls_to(sample_9324d1a8ae37a36ae560c37448c9705a): features = extract_function_features(viv_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.vw, 0x406F60)) - assert capa.features.Characteristic('calls to', True) in features - assert len(features[capa.features.Characteristic('calls to', True)]) == 1 + assert capa.features.Characteristic('calls to') in features + assert len(features[capa.features.Characteristic('calls to')]) == 1 def test_function_calls_to64(sample_lab21_01): features = extract_function_features(viv_utils.Function(sample_lab21_01.vw, 0x1400052D0)) # memcpy - assert capa.features.Characteristic('calls to', True) in features - assert len(features[capa.features.Characteristic('calls to', True)]) == 8 + assert capa.features.Characteristic('calls to') in features + assert len(features[capa.features.Characteristic('calls to')]) == 8 def test_function_calls_from(sample_9324d1a8ae37a36ae560c37448c9705a): features = extract_function_features(viv_utils.Function(sample_9324d1a8ae37a36ae560c37448c9705a.vw, 0x406F60)) - assert capa.features.Characteristic('calls from', True) in features - assert len(features[capa.features.Characteristic('calls from', True)]) == 23 + assert capa.features.Characteristic('calls from') in features + assert len(features[capa.features.Characteristic('calls from')]) == 23 def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a): @@ -286,8 +286,8 @@ def test_basic_block_count(sample_9324d1a8ae37a36ae560c37448c9705a): def test_indirect_call_features(sample_a933a1a402775cfa94b6bee0963f4b46): features = extract_function_features(viv_utils.Function(sample_a933a1a402775cfa94b6bee0963f4b46.vw, 0xABA68A0)) - assert capa.features.Characteristic('indirect call', True) in features - assert 
len(features[capa.features.Characteristic('indirect call', True)]) == 3 + assert capa.features.Characteristic('indirect call') in features + assert len(features[capa.features.Characteristic('indirect call')]) == 3 def test_indirect_calls_resolved(sample_c91887d861d9bd4a5872249b641bc9f9): From 81741b49f7be843585e6c59559820ef7ef87e8dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Thu, 2 Jul 2020 10:08:17 +0200 Subject: [PATCH 16/18] Support inline descriptions for count ``` count(number(2 = AF_INET/SOCK_DGRAM)): 2 ``` --- README.md | 4 ++++ capa/render/vverbose.py | 11 +++++++---- capa/rules.py | 5 ++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7f8d3845..bbc8fc0d 100644 --- a/README.md +++ b/README.md @@ -526,6 +526,10 @@ These rules can be expressed like: count(mnemonic(mov)): 3 count(basic block): 4 +`count` supports inline descriptions, except for [strings](#string), using the following syntax: + + count(number(2 = AF_INET/SOCK_DGRAM)): 2 + ## matching prior rule matches capa rules can specify logic for matching on other rule matches. 
diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 9f6d700a..2e5a7221 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -42,15 +42,18 @@ def render_statement(ostream, match, statement, indent=0): child = statement['child'] if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match', 'characteristic'): - feature = '%s(%s)' % (child['type'], rutils.bold2(child[child['type']])) + value = rutils.bold2(child[child['type']]) elif child['type'] in ('number', 'offset'): - feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex(child[child['type']]))) + value = rutils.bold2(rutils.hex(child[child['type']])) elif child['type'] == 'bytes': - feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex_string(child[child['type']]))) + value = rutils.bold2(rutils.hex_string(child[child['type']])) else: raise RuntimeError('unexpected feature type: ' + str(child)) - ostream.write('count(%s): ' % feature) + if child['description']: + ostream.write('count(%s(%s = %s)): ' % (child['type'], value, child['description'])) + else: + ostream.write('count(%s(%s)): ' % (child['type'], value)) if statement['max'] == statement['min']: ostream.write('%d' % (statement['min'])) diff --git a/capa/rules.py b/capa/rules.py index 8c0f995c..7bcc6695 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -316,13 +316,12 @@ def build_statements(d, scope): # count(offset(0xC)) # count(number(0x11223344)) # count(number(0x100 = description)) - if term in ('number', 'offset', 'bytes'): + if term != 'string': value, description = parse_description(arg, term) feature = Feature(value, description) else: - # arg is string, like: + # arg is string (which doesn't support inline descriptions), like: # - # count(mnemonic(mov)) # count(string(error)) # TODO: what about embedded newlines? 
+ feature = Feature(arg) From 152129cc2567a30a7e61de0276113f08e2d06617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Thu, 2 Jul 2020 10:42:30 +0200 Subject: [PATCH 17/18] Add tests for description feature Test if the parsing of feature succeeds with every type of description. --- tests/test_rules.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_rules.py b/tests/test_rules.py index 18b1746f..7de66a0a 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -4,6 +4,7 @@ import pytest import capa.rules from capa.features.insn import Number, Offset +from capa.features import String def test_rule_ctor(): @@ -56,6 +57,22 @@ def test_rule_yaml_complex(): assert r.evaluate({Number(6): {1}, Number(7): {1}, Number(8): {1}}) == False +def test_rule_yaml_descriptions(): + rule = textwrap.dedent(''' + rule: + meta: + name: test rule + features: + - and: + - number: 1 = This is the number 1 + - string: This program cannot be run in DOS mode. + description: MS-DOS stub message + - count(number(2 = AF_INET/SOCK_DGRAM)): 2 + ''') + r = capa.rules.Rule.from_yaml(rule) + assert r.evaluate({Number(1): {1}, Number(2): {2, 3}, String('This program cannot be run in DOS mode.'): {4}}) == True + + def test_rule_yaml_not(): rule = textwrap.dedent(''' rule: From acbcd0c4b038456c48f13843036b190b59d1d617 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 2 Jul 2020 10:17:44 -0600 Subject: [PATCH 18/18] submodule: rules: update --- rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules b/rules index da61c913..bb1df027 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit da61c9138efee257bddfb5f68d1905578e11e23a +Subproject commit bb1df0277d5823179626388d3a2da81a03d6a723