diff --git a/CHANGELOG.md b/CHANGELOG.md index b035aaf8..47a84ebe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### New Features +- engine: short circuit logic nodes for better performance #824 @williballenthin +- engine: add optimizer to order faster nodes first #829 @williballenthin - engine: optimize rule evaluation by skipping rules that can't match #830 @williballenthin ### Breaking Changes diff --git a/capa/engine.py b/capa/engine.py index 9591a2c5..2dbb3574 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -46,9 +46,12 @@ class Statement: def __repr__(self): return str(self) - def evaluate(self, features: FeatureSet) -> Result: + def evaluate(self, features: FeatureSet, short_circuit=True) -> Result: """ classes that inherit `Statement` must implement `evaluate` + + args: + short_circuit (bool): if true, then statements like and/or/some may short circuit. """ raise NotImplementedError() @@ -73,35 +76,69 @@ class Statement: class And(Statement): - """match if all of the children evaluate to True.""" + """ + match if all of the children evaluate to True. + + the order of evaluation is dictated by the property + `And.children` (type: List[Statement|Feature]). + a query optimizer may safely manipulate the order of these children. 
+ """ def __init__(self, children, description=None): super(And, self).__init__(description=description) self.children = children - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.and"] += 1 - results = [child.evaluate(ctx) for child in self.children] - success = all(results) - return Result(success, self, results) + if short_circuit: + results = [] + for child in self.children: + result = child.evaluate(ctx, short_circuit=short_circuit) + results.append(result) + if not result: + # short circuit + return Result(False, self, results) + + return Result(True, self, results) + else: + results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children] + success = all(results) + return Result(success, self, results) class Or(Statement): - """match if any of the children evaluate to True.""" + """ + match if any of the children evaluate to True. + + the order of evaluation is dictated by the property + `Or.children` (type: List[Statement|Feature]). + a query optimizer may safely manipulate the order of these children. 
+ """ def __init__(self, children, description=None): super(Or, self).__init__(description=description) self.children = children - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.or"] += 1 - results = [child.evaluate(ctx) for child in self.children] - success = any(results) - return Result(success, self, results) + if short_circuit: + results = [] + for child in self.children: + result = child.evaluate(ctx, short_circuit=short_circuit) + results.append(result) + if result: + # short circuit as soon as we hit one match + return Result(True, self, results) + + return Result(False, self, results) + else: + results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children] + success = any(results) + return Result(success, self, results) class Not(Statement): @@ -111,34 +148,55 @@ class Not(Statement): super(Not, self).__init__(description=description) self.child = child - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.not"] += 1 - results = [self.child.evaluate(ctx)] + results = [self.child.evaluate(ctx, short_circuit=short_circuit)] success = not results[0] return Result(success, self, results) class Some(Statement): - """match if at least N of the children evaluate to True.""" + """ + match if at least N of the children evaluate to True. + + the order of evaluation is dictated by the property + `Some.children` (type: List[Statement|Feature]). + a query optimizer may safely manipulate the order of these children. 
+ """ def __init__(self, count, children, description=None): super(Some, self).__init__(description=description) self.count = count self.children = children - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.some"] += 1 - results = [child.evaluate(ctx) for child in self.children] - # note that here we cast the child result as a bool - # because we've overridden `__bool__` above. - # - # we can't use `if child is True` because the instance is not True. - success = sum([1 for child in results if bool(child) is True]) >= self.count - return Result(success, self, results) + if short_circuit: + results = [] + satisfied_children_count = 0 + for child in self.children: + result = child.evaluate(ctx, short_circuit=short_circuit) + results.append(result) + if result: + satisfied_children_count += 1 + + if satisfied_children_count >= self.count: + # short circuit as soon as we hit the threshold + return Result(True, self, results) + + return Result(False, self, results) + else: + results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children] + # note that here we cast the child result as a bool + # because we've overridden `__bool__` above. + # + # we can't use `if child is True` because the instance is not True. 
+ success = sum([1 for child in results if bool(child) is True]) >= self.count + return Result(success, self, results) class Range(Statement): @@ -150,7 +208,7 @@ class Range(Statement): self.min = min if min is not None else 0 self.max = max if max is not None else (1 << 64 - 1) - def evaluate(self, ctx): + def evaluate(self, ctx, **kwargs): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.range"] += 1 @@ -178,7 +236,7 @@ class Subscope(Statement): self.scope = scope self.child = child - def evaluate(self, ctx): + def evaluate(self, ctx, **kwargs): raise ValueError("cannot evaluate a subscope directly!") @@ -241,8 +299,18 @@ def match(rules: List["capa.rules.Rule"], features: FeatureSet, va: int) -> Tupl features = collections.defaultdict(set, copy.copy(features)) for rule in rules: - res = rule.evaluate(features) + res = rule.evaluate(features, short_circuit=True) if res: + # we first matched the rule with short circuiting enabled. + # this is much faster than without short circuiting. + # however, we want to collect all results thoroughly, + # so once we've found a match quickly, + # go back and capture results without short circuiting. + res = rule.evaluate(features, short_circuit=False) + + # sanity check + assert bool(res) is True + results[rule.name].append((va, res)) # we need to update the current `features` # because subsequent iterations of this loop may use newly added features, diff --git a/capa/features/common.py b/capa/features/common.py index 3a4e71e9..6b867766 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -146,7 +146,7 @@ class Feature: def __repr__(self): return str(self) - def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result: + def evaluate(self, ctx: Dict["Feature", Set[int]], **kwargs) -> Result: capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature." 
+ self.name] += 1 return Result(self in ctx, self, [], locations=ctx.get(self, [])) @@ -192,7 +192,7 @@ class Substring(String): super(Substring, self).__init__(value, description=description) self.value = value - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.substring"] += 1 @@ -210,6 +210,10 @@ class Substring(String): if self.value in feature.value: matches[feature.value].extend(locations) + if short_circuit: + # we found one matching string, that's sufficient to match. + # don't collect other matching strings in this mode. + break if matches: # finalize: defaultdict -> dict @@ -280,7 +284,7 @@ class Regex(String): "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value ) - def evaluate(self, ctx): + def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.regex"] += 1 @@ -302,6 +306,10 @@ class Regex(String): # so that they don't have to prefix/suffix their terms like: /.*foo.*/. if self.re.search(feature.value): matches[feature.value].extend(locations) + if short_circuit: + # we found one matching string, that's sufficient to match. + # don't collect other matching strings in this mode. 
+ break if matches: # finalize: defaultdict -> dict @@ -366,7 +374,7 @@ class Bytes(Feature): super(Bytes, self).__init__(value, description=description) self.value = value - def evaluate(self, ctx): + def evaluate(self, ctx, **kwargs): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.bytes"] += 1 diff --git a/capa/optimizer.py b/capa/optimizer.py new file mode 100644 index 00000000..0408bf07 --- /dev/null +++ b/capa/optimizer.py @@ -0,0 +1,70 @@ +import logging + +import capa.engine as ceng +import capa.features.common + +logger = logging.getLogger(__name__) + + +def get_node_cost(node): + if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)): + # we assume these are the most restrictive features: + # authors commonly use them at the start of rules to restrict the category of samples to inspect + return 0 + + # elif "everything else": + # return 1 + # + # this should be all hash-lookup features. + # see below. + + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)): + # substring and regex features require a full scan of each string + # which we anticipate is more expensive than a hash lookup feature (e.g. mnemonic or count). + # + # TODO: compute the average cost of these features relative to hash features + # and adjust the factor accordingly. + return 2 + + elif isinstance(node, (ceng.Not, ceng.Range)): + # the cost of these nodes is defined by the complexity of their single child. + return 1 + get_node_cost(node.child) + + elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): + # the cost of these nodes is the full cost of their children + # as this is the worst-case scenario. + return 1 + sum(map(get_node_cost, node.children)) + + else: + # this should be all hash-lookup features. + # we give this an arbitrary weight of 1. + # the only thing more "important" than this is checking OS/Arch/Format. 
+ return 1 + + +def optimize_statement(statement): + # this routine operates in-place + + if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): + # has .children + statement.children = sorted(statement.children, key=lambda n: get_node_cost(n)) + return + elif isinstance(statement, (ceng.Not, ceng.Range)): + # has .child + optimize_statement(statement.child) + return + else: + # appears to be "simple" + return + + +def optimize_rule(rule): + # this routine operates in-place + optimize_statement(rule.statement) + + +def optimize_rules(rules): + logger.debug("optimizing %d rules", len(rules)) + for rule in rules: + optimize_rule(rule) + return rules diff --git a/capa/rules.py b/capa/rules.py index 3646117a..0eaaeaee 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -31,6 +31,7 @@ import ruamel.yaml import capa.perf import capa.engine as ceng import capa.features +import capa.optimizer import capa.features.file import capa.features.insn import capa.features.common @@ -627,10 +628,10 @@ class Rule: for new_rule in self._extract_subscope_rules_rec(self.statement): yield new_rule - def evaluate(self, features: FeatureSet): + def evaluate(self, features: FeatureSet, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.rule"] += 1 - return self.statement.evaluate(features) + return self.statement.evaluate(features, short_circuit=short_circuit) @classmethod def from_dict(cls, d, definition): @@ -968,6 +969,8 @@ class RuleSet: if len(rules) == 0: raise InvalidRuleSet("no rules selected") + rules = capa.optimizer.optimize_rules(rules) + self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE) self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE) self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE) diff --git a/scripts/profile-time.py b/scripts/profile-time.py index 3c47b67b..3d6b5e07 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -1,3 +1,34 @@ +""" +Invoke 
capa multiple times and record profiling information. +Use the --number and --repeat options to change the number of iterations. +By default, the script will emit a markdown table with a label pulled from git. + +Note: you can run this script against pre-generated .frz files to reduce the startup time. + +usage: + + usage: profile-time.py [--number NUMBER] [--repeat REPEAT] [--label LABEL] sample + + Profile capa performance + + positional arguments: + sample path to sample to analyze + + optional arguments: + --number NUMBER batch size of profile collection + --repeat REPEAT batch count of profile collection + --label LABEL description of the profile collection + +example: + + $ python profile-time.py ./tests/data/kernel32.dll_.frz --number 1 --repeat 2 + + | label | count(evaluations) | min(time) | avg(time) | max(time) | + |--------------------------------------|----------------------|-------------|-------------|-------------| + | 18c30e4 main: remove perf debug msgs | 66,561,622 | 125.14s | 132.13s | 139.12s | + + ^^^ --label or git hash +""" import sys import timeit import logging @@ -98,12 +129,16 @@ def main(argv=None): ( args.label, "{:,}".format(capa.perf.counters["evaluate.feature"]), - "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)), + # python documentation indicates that min(samples) should be preferred, + # so let's put that first. 
+ # + # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat "%0.2fs" % (min(samples) / float(args.number)), + "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)), "%0.2fs" % (max(samples) / float(args.number)), ) ], - headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"], + headers=["label", "count(evaluations)", "min(time)", "avg(time)", "max(time)"], tablefmt="github", ) ) diff --git a/tests/test_engine.py b/tests/test_engine.py index ce421759..b07c89e6 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -533,3 +533,29 @@ def test_render_offset(): assert str(capa.features.insn.Offset(1)) == "offset(0x1)" assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X32)) == "offset/x32(0x1)" assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X64)) == "offset/x64(0x1)" + + +def test_short_circuit(): + assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True + + # with short circuiting, only the children up until the first satisfied child are captured. + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=True).children) == 1 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=False).children) == 2 + + +def test_eval_order(): + # base cases. + assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}) == True + + # with short circuiting, only the children up until the first satisfied child are captured. + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children) == 1 + assert len(Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children) == 2 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}, Number(2): {1}}).children) == 1 + + # and it's guaranteed that children are evaluated in order. 
+ assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement == Number(1) + assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement != Number(2) + + assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement == Number(2) + assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement != Number(1) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py new file mode 100644 index 00000000..69a79bd6 --- /dev/null +++ b/tests/test_optimizer.py @@ -0,0 +1,65 @@ +# Copyright (C) 2021 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +import textwrap + +import pytest + +import capa.rules +import capa.engine +import capa.optimizer +import capa.features.common +from capa.engine import Or, And +from capa.features.insn import Mnemonic +from capa.features.common import Arch, Bytes, Substring + + +def test_optimizer_order(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: function + features: + - and: + - substring: "foo" + - arch: amd64 + - mnemonic: cmp + - and: + - bytes: 3 + - offset: 2 + - or: + - number: 1 + - offset: 4 + """ + ) + r = capa.rules.Rule.from_yaml(rule) + + # before optimization + children = list(r.statement.get_children()) + assert isinstance(children[0], Substring) + assert isinstance(children[1], Arch) + assert isinstance(children[2], Mnemonic) + assert isinstance(children[3], And) + assert isinstance(children[4], Or) + + # after optimization + capa.optimizer.optimize_rules([r]) + children = list(r.statement.get_children()) + + # cost: 0 + assert isinstance(children[0], Arch) + # cost: 1 + assert isinstance(children[1], Mnemonic) + # cost: 2 + assert isinstance(children[2], Substring) + # cost: 3 + assert isinstance(children[3], Or) + # cost: 4 + assert isinstance(children[4], And)