From 623bac1a406de9f113fa479c034d401467aeefc2 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 16:19:16 -0600 Subject: [PATCH 01/26] engine: statement: document that the order of children is important --- capa/engine.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 7a2dea31..f64e1975 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -119,7 +119,13 @@ class Result: class And(Statement): - """match if all of the children evaluate to True.""" + """ + match if all of the children evaluate to True. + + the order of evaluation is dicated by the property + `And.children` (type: List[Statement|Feature]). + a query optimizer may safely manipulate the order of these children. + """ def __init__(self, children, description=None): super(And, self).__init__(description=description) @@ -129,13 +135,25 @@ class And(Statement): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.and"] += 1 - results = [child.evaluate(ctx) for child in self.children] - success = all(results) - return Result(success, self, results) + results = [] + for child in self.children: + result = child.evaluate(ctx) + results.append(result) + if not result: + # short circuit + return Result(False, self, results) + + return Result(True, self, results) class Or(Statement): - """match if any of the children evaluate to True.""" + """ + match if any of the children evaluate to True. + + the order of evaluation is dicated by the property + `Or.children` (type: List[Statement|Feature]). + a query optimizer may safely manipulate the order of these children. + """ def __init__(self, children, description=None): super(Or, self).__init__(description=description) @@ -167,7 +185,13 @@ class Not(Statement): class Some(Statement): - """match if at least N of the children evaluate to True.""" + """ + match if at least N of the children evaluate to True. 
+ + the order of evaluation is dicated by the property + `Some.children` (type: List[Statement|Feature]). + a query optimizer may safely manipulate the order of these children. + """ def __init__(self, count, children, description=None): super(Some, self).__init__(description=description) From 8d9f418b2bc4305d04cd873e2f387a139a42701b Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 16:20:22 -0600 Subject: [PATCH 02/26] rules: optimize by cost --- capa/rules.py | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/capa/rules.py b/capa/rules.py index 6960e02b..caafb428 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -961,6 +961,8 @@ class RuleSet: if len(rules) == 0: raise InvalidRuleSet("no rules selected") + rules = self._optimize_rules(rules) + self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE) self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE) self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE) @@ -1038,3 +1040,55 @@ class RuleSet: rules_filtered.update(set(capa.rules.get_rules_and_dependencies(rules, rule.name))) break return RuleSet(list(rules_filtered)) + + @staticmethod + def _get_node_cost(node): + if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)): + return 0 + + # elif "everything else": + # return 1 + # + # this should be all hash-lookup features. + # see below. + + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): + return 2 + + elif isinstance(node, (ceng.Not, ceng.Range)): + return 3 + + elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): + return 4 + + else: + # this should be all hash-lookup features. 
+ return 1 + + @staticmethod + def _optimize_statement(statement): + # this routine operates in-place + + if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): + # has .children + statement.children = sorted(statement.children, key=lambda n: -RuleSet._get_node_cost(n)) + return + elif isinstance(statement, (ceng.Not, ceng.Range)): + # has .child + RuleSet._optimize_statement(statement.child) + return + else: + # appears to be "simple" + return + + @staticmethod + def _optimize_rule(rule): + # operates in-place + RuleSet._optimize_statement(rule.statement) + + @staticmethod + def _optimize_rules(rules): + logger.debug("optimizing %d rules", len(rules)) + for rule in rules: + RuleSet._optimize_rule(rule) + return rules From 18ba986eba853108a0f48a274c61bb71ed904974 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 16:32:12 -0600 Subject: [PATCH 03/26] engine: or: short circuit --- capa/engine.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index f64e1975..871119c9 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -162,10 +162,16 @@ class Or(Statement): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.or"] += 1 - - results = [child.evaluate(ctx) for child in self.children] - success = any(results) - return Result(success, self, results) + + results = [] + for child in self.children: + result = child.evaluate(ctx) + results.append(result) + if result: + # short circuit as soon as we hit one match + return Result(True, self, results) + + return Result(False, self, results) class Not(Statement): From a329147d28b159ae40e4686e8a5d1ed27d97fd77 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 16:32:23 -0600 Subject: [PATCH 04/26] engine: some: short circuit --- capa/engine.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 
871119c9..0f78a6c1 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -207,14 +207,16 @@ class Some(Statement): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.some"] += 1 - - results = [child.evaluate(ctx) for child in self.children] - # note that here we cast the child result as a bool - # because we've overridden `__bool__` above. - # - # we can't use `if child is True` because the instance is not True. - success = sum([1 for child in results if bool(child) is True]) >= self.count - return Result(success, self, results) + + results = [] + for child in self.children: + result = child.evaluate(ctx) + results.append(result) + if len(results) >= self.count: + # short circuit as soon as we hit the threshold + return Result(True, self, results) + + return Result(False, self, results) class Range(Statement): From e63f072e409433af763c05891e5d15357e999214 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 16:39:00 -0600 Subject: [PATCH 05/26] rules: optimizer: use recursive cost of statements --- capa/rules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/rules.py b/capa/rules.py index caafb428..57ebc791 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -1056,10 +1056,10 @@ class RuleSet: return 2 elif isinstance(node, (ceng.Not, ceng.Range)): - return 3 + return RuleSet._get_node_cost(node.child) elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): - return 4 + return sum(map(RuleSet._get_node_cost, node.children)) else: # this should be all hash-lookup features. 
From d573b83c947892d62ffa2118afe308d2954f0372 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 16:49:38 -0600 Subject: [PATCH 06/26] rule: optimization: add some documentation --- capa/rules.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/capa/rules.py b/capa/rules.py index 57ebc791..038f4d73 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -1044,6 +1044,8 @@ class RuleSet: @staticmethod def _get_node_cost(node): if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)): + # we assume these are the most restrictive features: + # authors commonly use them at the start of rules to restrict the category of samples to inspect return 0 # elif "everything else": @@ -1053,16 +1055,26 @@ class RuleSet: # see below. elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): + # substring and regex features require a full scan of each string + # which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count). + # + # TODO: compute the average cost of these feature relative to hash feature + # and adjust the factor accordingly. return 2 elif isinstance(node, (ceng.Not, ceng.Range)): + # the cost of these nodes are defined by the complexity of their single child. return RuleSet._get_node_cost(node.child) elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): + # the cost of these nodes is the full cost of their children + # as this is the worst-case scenario. return sum(map(RuleSet._get_node_cost, node.children)) else: # this should be all hash-lookup features. + # we give this a arbitrary weight of 1. + # the only thing more "important" than this is checking OS/Arch/Format. 
return 1 @staticmethod @@ -1083,7 +1095,7 @@ class RuleSet: @staticmethod def _optimize_rule(rule): - # operates in-place + # this routine operates in-place RuleSet._optimize_statement(rule.statement) @staticmethod From d86c3f4d48557094acfd56b860a8b2f125384af1 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 11:50:16 -0700 Subject: [PATCH 07/26] common: move Result to capa.common from capa.engine fixes circular import error in capa.features.freeze --- capa/engine.py | 50 ++----------------------------- capa/features/common.py | 65 ++++++++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 58 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 0f78a6c1..d7ff81f6 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -13,7 +13,7 @@ from typing import Set, Dict, List, Tuple, Union, Mapping, Iterable import capa.perf import capa.rules import capa.features.common -from capa.features.common import Feature +from capa.features.common import Result, Feature # a collection of features and the locations at which they are found. # @@ -46,15 +46,9 @@ class Statement: def __repr__(self): return str(self) - def evaluate(self, features: FeatureSet) -> "Result": + def evaluate(self, features: FeatureSet) -> Result: """ classes that inherit `Statement` must implement `evaluate` - - args: - ctx (defaultdict[Feature, set[VA]]) - - returns: - Result """ raise NotImplementedError() @@ -78,46 +72,6 @@ class Statement: children[i] = new -class Result: - """ - represents the results of an evaluation of statements against features. - - instances of this class should behave like a bool, - e.g. `assert Result(True, ...) == True` - - instances track additional metadata about evaluation results. - they contain references to the statement node (e.g. an And statement), - as well as the children Result instances. - - we need this so that we can render the tree of expressions and their results. 
- """ - - def __init__(self, success: bool, statement: Union[Statement, Feature], children: List["Result"], locations=None): - """ - args: - success (bool) - statement (capa.engine.Statement or capa.features.Feature) - children (list[Result]) - locations (iterable[VA]) - """ - super(Result, self).__init__() - self.success = success - self.statement = statement - self.children = children - self.locations = locations if locations is not None else () - - def __eq__(self, other): - if isinstance(other, bool): - return self.success == other - return False - - def __bool__(self): - return self.success - - def __nonzero__(self): - return self.success - - class And(Statement): """ match if all of the children evaluate to True. diff --git a/capa/features/common.py b/capa/features/common.py index 9fa5d8bf..0f01ef52 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -10,10 +10,9 @@ import re import codecs import logging import collections -from typing import Set, Dict, Union +from typing import Set, Dict, List, Union import capa.perf -import capa.engine import capa.features import capa.features.extractors.elf @@ -47,6 +46,52 @@ def escape_string(s: str) -> str: return s +class Result: + """ + represents the results of an evaluation of statements against features. + + instances of this class should behave like a bool, + e.g. `assert Result(True, ...) == True` + + instances track additional metadata about evaluation results. + they contain references to the statement node (e.g. an And statement), + as well as the children Result instances. + + we need this so that we can render the tree of expressions and their results. 
+ """ + + def __init__( + self, + success: bool, + statement: Union["capa.engine.Statement", "Feature"], + children: List["Result"], + locations=None, + ): + """ + args: + success (bool) + statement (capa.engine.Statement or capa.features.Feature) + children (list[Result]) + locations (iterable[VA]) + """ + super(Result, self).__init__() + self.success = success + self.statement = statement + self.children = children + self.locations = locations if locations is not None else () + + def __eq__(self, other): + if isinstance(other, bool): + return self.success == other + return False + + def __bool__(self): + return self.success + + def __nonzero__(self): + return self.success + + class Feature: def __init__(self, value: Union[str, int, bytes], bitness=None, description=None): """ @@ -97,10 +142,10 @@ class Feature: def __repr__(self): return str(self) - def evaluate(self, ctx: Dict["Feature", Set[int]]) -> "capa.engine.Result": + def evaluate(self, ctx: Dict["Feature", Set[int]]) -> "Result": capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature." + self.name] += 1 - return capa.engine.Result(self in ctx, self, [], locations=ctx.get(self, [])) + return Result(self in ctx, self, [], locations=ctx.get(self, [])) def freeze_serialize(self): if self.bitness is not None: @@ -176,9 +221,9 @@ class Substring(String): # unlike other features, we cannot return put a reference to `self` directly in a `Result`. # this is because `self` may match on many strings, so we can't stuff the matched value into it. # instead, return a new instance that has a reference to both the substring and the matched values. 
- return capa.engine.Result(True, _MatchedSubstring(self, matches), [], locations=locations) + return Result(True, _MatchedSubstring(self, matches), [], locations=locations) else: - return capa.engine.Result(False, _MatchedSubstring(self, None), []) + return Result(False, _MatchedSubstring(self, None), []) def __str__(self): return "substring(%s)" % self.value @@ -269,9 +314,9 @@ class Regex(String): # this is because `self` may match on many strings, so we can't stuff the matched value into it. # instead, return a new instance that has a reference to both the regex and the matched values. # see #262. - return capa.engine.Result(True, _MatchedRegex(self, matches), [], locations=locations) + return Result(True, _MatchedRegex(self, matches), [], locations=locations) else: - return capa.engine.Result(False, _MatchedRegex(self, None), []) + return Result(False, _MatchedRegex(self, None), []) def __str__(self): return "regex(string =~ %s)" % self.value @@ -326,9 +371,9 @@ class Bytes(Feature): continue if feature.value.startswith(self.value): - return capa.engine.Result(True, self, [], locations=locations) + return Result(True, self, [], locations=locations) - return capa.engine.Result(False, self, []) + return Result(False, self, []) def get_value_str(self): return hex_string(bytes_to_str(self.value)) From 35fa50dbee0f4d0e2d10689abd6539795f0d5dd0 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 11:50:37 -0700 Subject: [PATCH 08/26] pep8 --- capa/engine.py | 20 ++++++++++---------- capa/features/common.py | 10 +++++----- capa/rules.py | 6 +++--- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index d7ff81f6..601ddd34 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -87,11 +87,11 @@ class And(Statement): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.and"] += 1 + capa.perf.counters["evaluate.feature.and"] += 1 results = [] for child in 
self.children: - result = child.evaluate(ctx) + result = child.evaluate(ctx) results.append(result) if not result: # short circuit @@ -103,7 +103,7 @@ class And(Statement): class Or(Statement): """ match if any of the children evaluate to True. - + the order of evaluation is dicated by the property `Or.children` (type: List[Statement|Feature]). a query optimizer may safely manipulate the order of these children. @@ -115,7 +115,7 @@ class Or(Statement): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.or"] += 1 + capa.perf.counters["evaluate.feature.or"] += 1 results = [] for child in self.children: @@ -137,8 +137,8 @@ class Not(Statement): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.not"] += 1 - + capa.perf.counters["evaluate.feature.not"] += 1 + results = [self.child.evaluate(ctx)] success = not results[0] return Result(success, self, results) @@ -160,8 +160,8 @@ class Some(Statement): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.some"] += 1 - + capa.perf.counters["evaluate.feature.some"] += 1 + results = [] for child in self.children: result = child.evaluate(ctx) @@ -184,8 +184,8 @@ class Range(Statement): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.range"] += 1 - + capa.perf.counters["evaluate.feature.range"] += 1 + count = len(ctx.get(self.child, [])) if self.min == 0 and count == 0: return Result(True, self, []) diff --git a/capa/features/common.py b/capa/features/common.py index 0f01ef52..a40201e3 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -190,7 +190,7 @@ class Substring(String): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.substring"] += 1 + capa.perf.counters["evaluate.feature.substring"] += 1 # mapping from string value 
to list of locations. # will unique the locations later on. @@ -278,8 +278,8 @@ class Regex(String): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.regex"] += 1 - + capa.perf.counters["evaluate.feature.regex"] += 1 + # mapping from string value to list of locations. # will unique the locations later on. matches = collections.defaultdict(list) @@ -364,8 +364,8 @@ class Bytes(Feature): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.bytes"] += 1 - + capa.perf.counters["evaluate.feature.bytes"] += 1 + for feature, locations in ctx.items(): if not isinstance(feature, (Bytes,)): continue diff --git a/capa/rules.py b/capa/rules.py index 038f4d73..b49f7ee1 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -622,7 +622,7 @@ class Rule: def evaluate(self, features: FeatureSet): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.rule"] += 1 + capa.perf.counters["evaluate.feature.rule"] += 1 return self.statement.evaluate(features) @classmethod @@ -1053,7 +1053,7 @@ class RuleSet: # # this should be all hash-lookup features. # see below. - + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): # substring and regex features require a full scan of each string # which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count). @@ -1070,7 +1070,7 @@ class RuleSet: # the cost of these nodes is the full cost of their children # as this is the worst-case scenario. return sum(map(RuleSet._get_node_cost, node.children)) - + else: # this should be all hash-lookup features. # we give this a arbitrary weight of 1. 
From a995b53c380edc345757d50060e9d768661de41c Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 11:50:49 -0700 Subject: [PATCH 09/26] perf: add reset routine --- capa/perf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/capa/perf.py b/capa/perf.py index d1e4083d..c7b416c0 100644 --- a/capa/perf.py +++ b/capa/perf.py @@ -1,3 +1,8 @@ import collections counters = collections.Counter() + + +def reset(): + global counters + counters = collections.Counter() From 480df323e5153a7cb89403c2ef0c657f514b8d69 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 11:51:09 -0700 Subject: [PATCH 10/26] scripts: add py script for profiling time --- scripts/profile-time.py | 115 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 scripts/profile-time.py diff --git a/scripts/profile-time.py b/scripts/profile-time.py new file mode 100644 index 00000000..3c47b67b --- /dev/null +++ b/scripts/profile-time.py @@ -0,0 +1,115 @@ +import sys +import timeit +import logging +import argparse +import subprocess + +import tqdm +import tabulate + +import capa.main +import capa.perf +import capa.rules +import capa.engine +import capa.helpers +import capa.features +import capa.features.common +import capa.features.freeze + +logger = logging.getLogger("capa.profile") + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + label = subprocess.run( + "git show --pretty=oneline --abbrev-commit | head -n 1", shell=True, capture_output=True, text=True + ).stdout.strip() + is_dirty = ( + subprocess.run( + "git status | grep 'modified: ' | grep -v 'rules' | grep -v 'tests/data'", + shell=True, + capture_output=True, + text=True, + ).stdout + != "" + ) + + if is_dirty: + label += " (dirty)" + + parser = argparse.ArgumentParser(description="Profile capa performance") + capa.main.install_common_args(parser, wanted={"format", "sample", "signatures", "rules"}) + + parser.add_argument("--number", type=int, 
default=3, help="batch size of profile collection") + parser.add_argument("--repeat", type=int, default=30, help="batch count of profile collection") + parser.add_argument("--label", type=str, default=label, help="description of the profile collection") + + args = parser.parse_args(args=argv) + capa.main.handle_common_args(args) + + try: + taste = capa.helpers.get_file_taste(args.sample) + except IOError as e: + logger.error("%s", str(e)) + return -1 + + try: + with capa.main.timing("load rules"): + rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True)) + except (IOError) as e: + logger.error("%s", str(e)) + return -1 + + try: + sig_paths = capa.main.get_signatures(args.signatures) + except (IOError) as e: + logger.error("%s", str(e)) + return -1 + + if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)): + with open(args.sample, "rb") as f: + extractor = capa.features.freeze.load(f.read()) + else: + extractor = capa.main.get_extractor( + args.sample, args.format, capa.main.BACKEND_VIV, sig_paths, should_save_workspace=False + ) + + with tqdm.tqdm(total=args.number * args.repeat) as pbar: + + def do_iteration(): + capa.perf.reset() + capa.main.find_capabilities(rules, extractor, disable_progress=True) + pbar.update(1) + + samples = timeit.repeat(do_iteration, number=args.number, repeat=args.repeat) + + logger.debug("perf: find capabilities: min: %0.2fs" % (min(samples) / float(args.number))) + logger.debug("perf: find capabilities: avg: %0.2fs" % (sum(samples) / float(args.repeat) / float(args.number))) + logger.debug("perf: find capabilities: max: %0.2fs" % (max(samples) / float(args.number))) + + for (counter, count) in capa.perf.counters.most_common(): + logger.debug("perf: counter: {:}: {:,}".format(counter, count)) + + print( + tabulate.tabulate( + [ + ( + args.label, + "{:,}".format(capa.perf.counters["evaluate.feature"]), + "%0.2fs" % (sum(samples) / float(args.repeat) / 
float(args.number)), + "%0.2fs" % (min(samples) / float(args.number)), + "%0.2fs" % (max(samples) / float(args.number)), + ) + ], + headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"], + tablefmt="github", + ) + ) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From e3496b0660d0c25318770e9ecd54b2c83b74928f Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 12:10:22 -0700 Subject: [PATCH 11/26] engine: move optimizer into its own module --- capa/optimizer.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++ capa/rules.py | 67 ++------------------------------------------- 2 files changed, 72 insertions(+), 65 deletions(-) create mode 100644 capa/optimizer.py diff --git a/capa/optimizer.py b/capa/optimizer.py new file mode 100644 index 00000000..462bdf0f --- /dev/null +++ b/capa/optimizer.py @@ -0,0 +1,70 @@ +import logging + +import capa.engine as ceng +import capa.features.common + +logger = logging.getLogger(__name__) + + +def get_node_cost(node): + if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)): + # we assume these are the most restrictive features: + # authors commonly use them at the start of rules to restrict the category of samples to inspect + return 0 + + # elif "everything else": + # return 1 + # + # this should be all hash-lookup features. + # see below. + + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): + # substring and regex features require a full scan of each string + # which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count). + # + # TODO: compute the average cost of these feature relative to hash feature + # and adjust the factor accordingly. + return 2 + + elif isinstance(node, (ceng.Not, ceng.Range)): + # the cost of these nodes are defined by the complexity of their single child. 
+ return get_node_cost(node.child) + + elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): + # the cost of these nodes is the full cost of their children + # as this is the worst-case scenario. + return sum(map(get_node_cost, node.children)) + + else: + # this should be all hash-lookup features. + # we give this a arbitrary weight of 1. + # the only thing more "important" than this is checking OS/Arch/Format. + return 1 + + +def optimize_statement(statement): + # this routine operates in-place + + if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): + # has .children + statement.children = sorted(statement.children, key=lambda n: -get_node_cost(n)) + return + elif isinstance(statement, (ceng.Not, ceng.Range)): + # has .child + optimize_statement(statement.child) + return + else: + # appears to be "simple" + return + + +def optimize_rule(rule): + # this routine operates in-place + optimize_statement(rule.statement) + + +def optimize_rules(rules): + logger.debug("optimizing %d rules", len(rules)) + for rule in rules: + optimize_rule(rule) + return rules diff --git a/capa/rules.py b/capa/rules.py index b49f7ee1..fb550ad4 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -30,6 +30,7 @@ import ruamel.yaml import capa.perf import capa.engine as ceng import capa.features +import capa.optimizer import capa.features.file import capa.features.insn import capa.features.common @@ -961,7 +962,7 @@ class RuleSet: if len(rules) == 0: raise InvalidRuleSet("no rules selected") - rules = self._optimize_rules(rules) + rules = capa.optimizer.optimize_rules(rules) self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE) self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE) @@ -1040,67 +1041,3 @@ class RuleSet: rules_filtered.update(set(capa.rules.get_rules_and_dependencies(rules, rule.name))) break return RuleSet(list(rules_filtered)) - - @staticmethod - def _get_node_cost(node): - if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, 
capa.features.common.Format)): - # we assume these are the most restrictive features: - # authors commonly use them at the start of rules to restrict the category of samples to inspect - return 0 - - # elif "everything else": - # return 1 - # - # this should be all hash-lookup features. - # see below. - - elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): - # substring and regex features require a full scan of each string - # which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count). - # - # TODO: compute the average cost of these feature relative to hash feature - # and adjust the factor accordingly. - return 2 - - elif isinstance(node, (ceng.Not, ceng.Range)): - # the cost of these nodes are defined by the complexity of their single child. - return RuleSet._get_node_cost(node.child) - - elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): - # the cost of these nodes is the full cost of their children - # as this is the worst-case scenario. - return sum(map(RuleSet._get_node_cost, node.children)) - - else: - # this should be all hash-lookup features. - # we give this a arbitrary weight of 1. - # the only thing more "important" than this is checking OS/Arch/Format. 
- return 1 - - @staticmethod - def _optimize_statement(statement): - # this routine operates in-place - - if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): - # has .children - statement.children = sorted(statement.children, key=lambda n: -RuleSet._get_node_cost(n)) - return - elif isinstance(statement, (ceng.Not, ceng.Range)): - # has .child - RuleSet._optimize_statement(statement.child) - return - else: - # appears to be "simple" - return - - @staticmethod - def _optimize_rule(rule): - # this routine operates in-place - RuleSet._optimize_statement(rule.statement) - - @staticmethod - def _optimize_rules(rules): - logger.debug("optimizing %d rules", len(rules)) - for rule in rules: - RuleSet._optimize_rule(rule) - return rules From 96813c37b7ac54a0d2a5af5168e705bee0e46f0b Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 13:48:33 -0700 Subject: [PATCH 12/26] remove old import --- capa/optimizer.py | 70 ----------------------------------------------- capa/rules.py | 3 -- 2 files changed, 73 deletions(-) delete mode 100644 capa/optimizer.py diff --git a/capa/optimizer.py b/capa/optimizer.py deleted file mode 100644 index 462bdf0f..00000000 --- a/capa/optimizer.py +++ /dev/null @@ -1,70 +0,0 @@ -import logging - -import capa.engine as ceng -import capa.features.common - -logger = logging.getLogger(__name__) - - -def get_node_cost(node): - if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)): - # we assume these are the most restrictive features: - # authors commonly use them at the start of rules to restrict the category of samples to inspect - return 0 - - # elif "everything else": - # return 1 - # - # this should be all hash-lookup features. - # see below. - - elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): - # substring and regex features require a full scan of each string - # which we anticipate is more expensive then a hash lookup feature (e.g.
mnemonic or count). - # - # TODO: compute the average cost of these feature relative to hash feature - # and adjust the factor accordingly. - return 2 - - elif isinstance(node, (ceng.Not, ceng.Range)): - # the cost of these nodes are defined by the complexity of their single child. - return get_node_cost(node.child) - - elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): - # the cost of these nodes is the full cost of their children - # as this is the worst-case scenario. - return sum(map(get_node_cost, node.children)) - - else: - # this should be all hash-lookup features. - # we give this a arbitrary weight of 1. - # the only thing more "important" than this is checking OS/Arch/Format. - return 1 - - -def optimize_statement(statement): - # this routine operates in-place - - if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): - # has .children - statement.children = sorted(statement.children, key=lambda n: -get_node_cost(n)) - return - elif isinstance(statement, (ceng.Not, ceng.Range)): - # has .child - optimize_statement(statement.child) - return - else: - # appears to be "simple" - return - - -def optimize_rule(rule): - # this routine operates in-place - optimize_statement(rule.statement) - - -def optimize_rules(rules): - logger.debug("optimizing %d rules", len(rules)) - for rule in rules: - optimize_rule(rule) - return rules diff --git a/capa/rules.py b/capa/rules.py index fb550ad4..2753f19d 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -30,7 +30,6 @@ import ruamel.yaml import capa.perf import capa.engine as ceng import capa.features -import capa.optimizer import capa.features.file import capa.features.insn import capa.features.common @@ -962,8 +961,6 @@ class RuleSet: if len(rules) == 0: raise InvalidRuleSet("no rules selected") - rules = capa.optimizer.optimize_rules(rules) - self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE) self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE) self.basic_block_rules = 
self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE) From d987719889da426e81c64768caa3f3d0108889b7 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 13:53:37 -0700 Subject: [PATCH 13/26] engine: some: correctly count satisfied children --- capa/engine.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/capa/engine.py b/capa/engine.py index 601ddd34..5d2383c6 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -163,10 +163,14 @@ class Some(Statement): capa.perf.counters["evaluate.feature.some"] += 1 results = [] + satisfied_children_count = 0 for child in self.children: result = child.evaluate(ctx) results.append(result) - if len(results) >= self.count: + if result: + satisfied_children_count += 1 + + if satisfied_children_count >= self.count: # short circuit as soon as we hit the threshold return Result(True, self, results) From 1a8405167930e977344bbd28701070c2c1c566bf Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 14:07:31 -0700 Subject: [PATCH 14/26] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0667535..49770247 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### New Features +- engine: short circuit logic nodes for better performance #824 @williballenthin + ### Breaking Changes ### New Rules (3) From 9fa9c6a5d099364e12dc2fb6555d4aeff3348e05 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 14:07:44 -0700 Subject: [PATCH 15/26] tests: add test demonstrating short circuiting --- tests/test_engine.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_engine.py b/tests/test_engine.py index ce421759..b130f2f2 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -533,3 +533,21 @@ def test_render_offset(): assert str(capa.features.insn.Offset(1)) == "offset(0x1)" assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X32)) == 
"offset/x32(0x1)" assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X64)) == "offset/x64(0x1)" + + +def test_short_circuit_order(): + # base cases. + assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}) == True + + # with short circuiting, only the children up until the first satisfied child are captured. + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children) == 1 + assert len(Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children) == 2 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}, Number(2): {1}}).children) == 1 + + # and its guaranteed that children are evaluated in order. + assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement == Number(1) + assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement != Number(2) + + assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement == Number(2) + assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement != Number(1) From 3e74da96a6f7b437e8b1f563ae2d1b3dc46ffc82 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 14:55:11 -0700 Subject: [PATCH 16/26] engine: make short circuiting configurable --- capa/engine.py | 104 ++++++++++++++++++++++++++-------------- capa/features/common.py | 16 +++++-- capa/rules.py | 4 +- tests/test_engine.py | 10 +++- 4 files changed, 91 insertions(+), 43 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 5d2383c6..b3a62f46 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -46,9 +46,12 @@ class Statement: def __repr__(self): return str(self) - def evaluate(self, features: FeatureSet) -> Result: + def evaluate(self, features: FeatureSet, short_circuit=True) -> Result: """ classes that inherit `Statement` must implement `evaluate` + + args: + short_circuit (bool): if true, then statements like and/or/some may 
short circuit. """ raise NotImplementedError() @@ -85,19 +88,24 @@ class And(Statement): super(And, self).__init__(description=description) self.children = children - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.and"] += 1 - results = [] - for child in self.children: - result = child.evaluate(ctx) - results.append(result) - if not result: - # short circuit - return Result(False, self, results) + if short_circuit: + results = [] + for child in self.children: + result = child.evaluate(ctx, short_circuit=short_circuit) + results.append(result) + if not result: + # short circuit + return Result(False, self, results) - return Result(True, self, results) + return Result(True, self, results) + else: + results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children] + success = all(results) + return Result(success, self, results) class Or(Statement): @@ -113,19 +121,24 @@ class Or(Statement): super(Or, self).__init__(description=description) self.children = children - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.or"] += 1 - results = [] - for child in self.children: - result = child.evaluate(ctx) - results.append(result) - if result: - # short circuit as soon as we hit one match - return Result(True, self, results) + if short_circuit: + results = [] + for child in self.children: + result = child.evaluate(ctx, short_circuit=short_circuit) + results.append(result) + if result: + # short circuit as soon as we hit one match + return Result(True, self, results) - return Result(False, self, results) + return Result(False, self, results) + else: + results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children] + success = any(results) + return Result(success, self, results) class Not(Statement): @@ -135,11 +148,11 @@ 
class Not(Statement): super(Not, self).__init__(description=description) self.child = child - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.not"] += 1 - results = [self.child.evaluate(ctx)] + results = [self.child.evaluate(ctx, short_circuit=short_circuit)] success = not results[0] return Result(success, self, results) @@ -158,23 +171,32 @@ class Some(Statement): self.count = count self.children = children - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.some"] += 1 - results = [] - satisfied_children_count = 0 - for child in self.children: - result = child.evaluate(ctx) - results.append(result) - if result: - satisfied_children_count += 1 + if short_circuit: + results = [] + satisfied_children_count = 0 + for child in self.children: + result = child.evaluate(ctx, short_circuit=short_circuit) + results.append(result) + if result: + satisfied_children_count += 1 - if satisfied_children_count >= self.count: - # short circuit as soon as we hit the threshold - return Result(True, self, results) + if satisfied_children_count >= self.count: + # short circuit as soon as we hit the threshold + return Result(True, self, results) - return Result(False, self, results) + return Result(False, self, results) + else: + results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children] + # note that here we cast the child result as a bool + # because we've overridden `__bool__` above. + # + # we can't use `if child is True` because the instance is not True. 
+ success = sum([1 for child in results if bool(child) is True]) >= self.count + return Result(success, self, results) class Range(Statement): @@ -186,7 +208,7 @@ class Range(Statement): self.min = min if min is not None else 0 self.max = max if max is not None else (1 << 64 - 1) - def evaluate(self, ctx): + def evaluate(self, ctx, **kwargs): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.range"] += 1 @@ -214,7 +236,7 @@ class Subscope(Statement): self.scope = scope self.child = child - def evaluate(self, ctx): + def evaluate(self, ctx, **kwargs): raise ValueError("cannot evaluate a subscope directly!") @@ -272,8 +294,18 @@ def match(rules: List["capa.rules.Rule"], features: FeatureSet, va: int) -> Tupl features = collections.defaultdict(set, copy.copy(features)) for rule in rules: - res = rule.evaluate(features) + res = rule.evaluate(features, short_circuit=True) if res: + # we first matched the rule with short circuiting enabled. + # this is much faster than without short circuiting. + # however, we want to collect all results thoroughly, + # so once we've found a match quickly, + # go back and capture results without short circuiting. + res = rule.evaluate(features, short_circuit=False) + + # sanity check + assert bool(res) is True + results[rule.name].append((va, res)) # we need to update the current `features` # because subsequent iterations of this loop may use newly added features, diff --git a/capa/features/common.py b/capa/features/common.py index 3a4e71e9..6b867766 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -146,7 +146,7 @@ class Feature: def __repr__(self): return str(self) - def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result: + def evaluate(self, ctx: Dict["Feature", Set[int]], **kwargs) -> Result: capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature." 
+ self.name] += 1 return Result(self in ctx, self, [], locations=ctx.get(self, [])) @@ -192,7 +192,7 @@ class Substring(String): super(Substring, self).__init__(value, description=description) self.value = value - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.substring"] += 1 @@ -210,6 +210,10 @@ class Substring(String): if self.value in feature.value: matches[feature.value].extend(locations) + if short_circuit: + # we found one matching string, thats sufficient to match. + # don't collect other matching strings in this mode. + break if matches: # finalize: defaultdict -> dict @@ -280,7 +284,7 @@ class Regex(String): "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value ) - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.regex"] += 1 @@ -302,6 +306,10 @@ class Regex(String): # so that they don't have to prefix/suffix their terms like: /.*foo.*/. if self.re.search(feature.value): matches[feature.value].extend(locations) + if short_circuit: + # we found one matching string, thats sufficient to match. + # don't collect other matching strings in this mode. 
+ break if matches: # finalize: defaultdict -> dict @@ -366,7 +374,7 @@ class Bytes(Feature): super(Bytes, self).__init__(value, description=description) self.value = value - def evaluate(self, ctx): + def evaluate(self, ctx, **kwargs): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.bytes"] += 1 diff --git a/capa/rules.py b/capa/rules.py index 2753f19d..00dc0837 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -620,10 +620,10 @@ class Rule: for new_rule in self._extract_subscope_rules_rec(self.statement): yield new_rule - def evaluate(self, features: FeatureSet): + def evaluate(self, features: FeatureSet, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.rule"] += 1 - return self.statement.evaluate(features) + return self.statement.evaluate(features, short_circuit=short_circuit) @classmethod def from_dict(cls, d, definition): diff --git a/tests/test_engine.py b/tests/test_engine.py index b130f2f2..b07c89e6 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -535,7 +535,15 @@ def test_render_offset(): assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X64)) == "offset/x64(0x1)" -def test_short_circuit_order(): +def test_short_circuit(): + assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True + + # with short circuiting, only the children up until the first satisfied child are captured. + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=True).children) == 1 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=False).children) == 2 + + +def test_eval_order(): # base cases. 
assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}) == True From 0b517c51d87634d72db8c67c0028673709bc3777 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 15:22:01 -0700 Subject: [PATCH 17/26] main: remove perf messages --- capa/main.py | 54 +++++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/capa/main.py b/capa/main.py index 5aaa97d6..52c28d27 100644 --- a/capa/main.py +++ b/capa/main.py @@ -169,28 +169,27 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro n_funcs = len(functions) pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions") - with timing("match functions"): - for f in pb: - function_address = int(f) + for f in pb: + function_address = int(f) - if extractor.is_library_function(function_address): - function_name = extractor.get_function_name(function_address) - logger.debug("skipping library function 0x%x (%s)", function_address, function_name) - meta["library_functions"][function_address] = function_name - n_libs = len(meta["library_functions"]) - percentage = 100 * (n_libs / n_funcs) - if isinstance(pb, tqdm.tqdm): - pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) - continue + if extractor.is_library_function(function_address): + function_name = extractor.get_function_name(function_address) + logger.debug("skipping library function 0x%x (%s)", function_address, function_name) + meta["library_functions"][function_address] = function_name + n_libs = len(meta["library_functions"]) + percentage = 100 * (n_libs / n_funcs) + if isinstance(pb, tqdm.tqdm): + pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) + continue - function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) - 
meta["feature_counts"]["functions"][function_address] = feature_count - logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) + function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) + meta["feature_counts"]["functions"][function_address] = feature_count + logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) - for rule_name, res in function_matches.items(): - all_function_matches[rule_name].extend(res) - for rule_name, res in bb_matches.items(): - all_bb_matches[rule_name].extend(res) + for rule_name, res in function_matches.items(): + all_function_matches[rule_name].extend(res) + for rule_name, res in bb_matches.items(): + all_bb_matches[rule_name].extend(res) # collection of features that captures the rule matches within function and BB scopes. # mapping from feature (matched rule) to set of addresses at which it matched. @@ -200,8 +199,7 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro rule = ruleset[rule_name] capa.engine.index_rule_matches(function_and_lower_features, rule, locations) - with timing("match file"): - all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features) + all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features) meta["feature_counts"]["file"] = feature_count matches = { @@ -413,11 +411,9 @@ def get_workspace(path, format, sigpaths): else: raise ValueError("unexpected format: " + format) - with timing("load FLIRT"): - viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths) + viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths) - with timing("viv analyze"): - vw.analyze() + vw.analyze() logger.debug("%s", get_meta_str(vw)) return vw @@ -905,9 +901,8 @@ def main(argv=None): return E_MISSING_FILE try: - with timing("load rules"): - rules = get_rules(args.rules, 
disable_progress=args.quiet) - rules = capa.rules.RuleSet(rules) + rules = get_rules(args.rules, disable_progress=args.quiet) + rules = capa.rules.RuleSet(rules) logger.debug( "successfully loaded %s rules", @@ -1020,8 +1015,7 @@ def main(argv=None): meta = collect_metadata(argv, args.sample, args.rules, extractor) - with timing("find capabilities"): - capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) + capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) meta["analysis"].update(counts) meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities) From 152d0f32443f8b1669173d61b351f81c24f4c9fb Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 15:34:59 -0700 Subject: [PATCH 18/26] ruleset: add query optimizer --- capa/optimizer.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++ capa/rules.py | 3 ++ 2 files changed, 73 insertions(+) create mode 100644 capa/optimizer.py diff --git a/capa/optimizer.py b/capa/optimizer.py new file mode 100644 index 00000000..462bdf0f --- /dev/null +++ b/capa/optimizer.py @@ -0,0 +1,70 @@ +import logging + +import capa.engine as ceng +import capa.features.common + +logger = logging.getLogger(__name__) + + +def get_node_cost(node): + if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)): + # we assume these are the most restrictive features: + # authors commonly use them at the start of rules to restrict the category of samples to inspect + return 0 + + # elif "everything else": + # return 1 + # + # this should be all hash-lookup features. + # see below. + + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): + # substring and regex features require a full scan of each string + # which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count). 
+ # + # TODO: compute the average cost of these feature relative to hash feature + # and adjust the factor accordingly. + return 2 + + elif isinstance(node, (ceng.Not, ceng.Range)): + # the cost of these nodes are defined by the complexity of their single child. + return get_node_cost(node.child) + + elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): + # the cost of these nodes is the full cost of their children + # as this is the worst-case scenario. + return sum(map(get_node_cost, node.children)) + + else: + # this should be all hash-lookup features. + # we give this a arbitrary weight of 1. + # the only thing more "important" than this is checking OS/Arch/Format. + return 1 + + +def optimize_statement(statement): + # this routine operates in-place + + if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): + # has .children + statement.children = sorted(statement.children, key=lambda n: -get_node_cost(n)) + return + elif isinstance(statement, (ceng.Not, ceng.Range)): + # has .child + optimize_statement(statement.child) + return + else: + # appears to be "simple" + return + + +def optimize_rule(rule): + # this routine operates in-place + optimize_statement(rule.statement) + + +def optimize_rules(rules): + logger.debug("optimizing %d rules", len(rules)) + for rule in rules: + optimize_rule(rule) + return rules diff --git a/capa/rules.py b/capa/rules.py index 00dc0837..2d53a0aa 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -30,6 +30,7 @@ import ruamel.yaml import capa.perf import capa.engine as ceng import capa.features +import capa.optimizer import capa.features.file import capa.features.insn import capa.features.common @@ -961,6 +962,8 @@ class RuleSet: if len(rules) == 0: raise InvalidRuleSet("no rules selected") + rules = capa.optimizer.optimize_rules(rules) + self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE) self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE) self.basic_block_rules = self._get_rules_for_scope(rules, 
BASIC_BLOCK_SCOPE) From e287dc9a32bef24ceaab907076d5377d6c892e18 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 15:54:14 -0700 Subject: [PATCH 19/26] optimizer: fix sort order --- capa/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/optimizer.py b/capa/optimizer.py index 462bdf0f..9d14c6e6 100644 --- a/capa/optimizer.py +++ b/capa/optimizer.py @@ -47,7 +47,7 @@ def optimize_statement(statement): if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): # has .children - statement.children = sorted(statement.children, key=lambda n: -get_node_cost(n)) + statement.children = sorted(statement.children, key=lambda n: get_node_cost(n)) return elif isinstance(statement, (ceng.Not, ceng.Range)): # has .child From 6909d6a54169879b04eb089d1e603adf6c50385c Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 16:04:15 -0700 Subject: [PATCH 20/26] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 665c3c1d..ccc9840d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### New Features - engine: short circuit logic nodes for better performance #824 @williballenthin +- engine: add optimizer the order faster nodes first #829 @williballenthin ### Breaking Changes From a68812b223a6bb74612c16ace2f2e95072db7ab2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 9 Nov 2021 10:48:54 -0700 Subject: [PATCH 21/26] Update capa/engine.py Co-authored-by: Moritz --- capa/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/engine.py b/capa/engine.py index b3a62f46..a690db90 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -79,7 +79,7 @@ class And(Statement): """ match if all of the children evaluate to True. - the order of evaluation is dicated by the property + the order of evaluation is dictated by the property `And.children` (type: List[Statement|Feature]). 
a query optimizer may safely manipulate the order of these children. """ From 51af2d4a561378f6c903c8da536c8c5bfdb6e5fd Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 9 Nov 2021 10:49:01 -0700 Subject: [PATCH 22/26] Update capa/engine.py Co-authored-by: Moritz --- capa/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/engine.py b/capa/engine.py index a690db90..a9076c25 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -112,7 +112,7 @@ class Or(Statement): """ match if any of the children evaluate to True. - the order of evaluation is dicated by the property + the order of evaluation is dictated by the property `Or.children` (type: List[Statement|Feature]). a query optimizer may safely manipulate the order of these children. """ From f427c5e9618bf0782083426dae2a1a2c44f3a778 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 9 Nov 2021 10:49:10 -0700 Subject: [PATCH 23/26] Update capa/engine.py Co-authored-by: Moritz --- capa/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/engine.py b/capa/engine.py index a9076c25..29c0dc65 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -161,7 +161,7 @@ class Some(Statement): """ match if at least N of the children evaluate to True. - the order of evaluation is dicated by the property + the order of evaluation is dictated by the property `Some.children` (type: List[Statement|Feature]). a query optimizer may safely manipulate the order of these children. 
""" From 7a4aee592be171c7ba7fde03623a1596db831b9e Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Tue, 9 Nov 2021 16:08:39 -0700 Subject: [PATCH 24/26] profile-time: add doc --- scripts/profile-time.py | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/scripts/profile-time.py b/scripts/profile-time.py index 3c47b67b..3d6b5e07 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -1,3 +1,34 @@ +""" +Invoke capa multiple times and record profiling informations. +Use the --number and --repeat options to change the number of iterations. +By default, the script will emit a markdown table with a label pulled from git. + +Note: you can run this script against pre-generated .frz files to reduce the startup time. + +usage: + + usage: profile-time.py [--number NUMBER] [--repeat REPEAT] [--label LABEL] sample + + Profile capa performance + + positional arguments: + sample path to sample to analyze + + optional arguments: + --number NUMBER batch size of profile collection + --repeat REPEAT batch count of profile collection + --label LABEL description of the profile collection + +example: + + $ python profile-time.py ./tests/data/kernel32.dll_.frz --number 1 --repeat 2 + + | label | count(evaluations) | avg(time) | min(time) | max(time) | + |--------------------------------------|----------------------|-------------|-------------|-------------| + | 18c30e4 main: remove perf debug msgs | 66,561,622 | 132.13s | 125.14s | 139.12s | + + ^^^ --label or git hash +""" import sys import timeit import logging @@ -98,12 +129,16 @@ def main(argv=None): ( args.label, "{:,}".format(capa.perf.counters["evaluate.feature"]), - "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)), + # python documentation indicates that min(samples) should be preferred, + # so lets put that first. 
+ # + # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat "%0.2fs" % (min(samples) / float(args.number)), + "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)), "%0.2fs" % (max(samples) / float(args.number)), ) ], - headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"], + headers=["label", "count(evaluations)", "min(time)", "avg(time)", "max(time)"], tablefmt="github", ) ) From ea386d02b68b6864586e433b8463a8060e220b85 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Tue, 9 Nov 2021 16:24:26 -0700 Subject: [PATCH 25/26] tests: add test demonstrating optimizer --- tests/test_optimizer.py | 65 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/test_optimizer.py diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py new file mode 100644 index 00000000..69a79bd6 --- /dev/null +++ b/tests/test_optimizer.py @@ -0,0 +1,65 @@ +# Copyright (C) 2021 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +import textwrap + +import pytest + +import capa.rules +import capa.engine +import capa.optimizer +import capa.features.common +from capa.engine import Or, And +from capa.features.insn import Mnemonic +from capa.features.common import Arch, Bytes, Substring + + +def test_optimizer_order(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: function + features: + - and: + - substring: "foo" + - arch: amd64 + - mnemonic: cmp + - and: + - bytes: 3 + - offset: 2 + - or: + - number: 1 + - offset: 4 + """ + ) + r = capa.rules.Rule.from_yaml(rule) + + # before optimization + children = list(r.statement.get_children()) + assert isinstance(children[0], Substring) + assert isinstance(children[1], Arch) + assert isinstance(children[2], Mnemonic) + assert isinstance(children[3], And) + assert isinstance(children[4], Or) + + # after optimization + capa.optimizer.optimize_rules([r]) + children = list(r.statement.get_children()) + + # cost: 0 + assert isinstance(children[0], Arch) + # cost: 1 + assert isinstance(children[1], Mnemonic) + # cost: 2 + assert isinstance(children[2], Substring) + # cost: 3 + assert isinstance(children[3], Or) + # cost: 4 + assert isinstance(children[4], And) From d4d801c246940904f959ad14dbdaefde51a672eb Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Tue, 9 Nov 2021 16:26:26 -0700 Subject: [PATCH 26/26] optimizer: tweak costs slightly --- capa/optimizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/capa/optimizer.py b/capa/optimizer.py index 9d14c6e6..0408bf07 100644 --- a/capa/optimizer.py +++ b/capa/optimizer.py @@ -18,7 +18,7 @@ def get_node_cost(node): # this should be all hash-lookup features. # see below. 
- elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)): + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)): # substring and regex features require a full scan of each string # which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count). # @@ -28,12 +28,12 @@ def get_node_cost(node): elif isinstance(node, (ceng.Not, ceng.Range)): # the cost of these nodes are defined by the complexity of their single child. - return get_node_cost(node.child) + return 1 + get_node_cost(node.child) elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): # the cost of these nodes is the full cost of their children # as this is the worst-case scenario. - return sum(map(get_node_cost, node.children)) + return 1 + sum(map(get_node_cost, node.children)) else: # this should be all hash-lookup features.