mirror of
https://github.com/mandiant/capa.git
synced 2026-06-27 17:03:06 -07:00
Merge branch 'master' into perf/rule-selection
This commit is contained in:
@@ -4,6 +4,8 @@
|
||||
|
||||
### New Features
|
||||
|
||||
- engine: short circuit logic nodes for better performance #824 @williballenthin
|
||||
- engine: add optimizer to order faster nodes first #829 @williballenthin
|
||||
- engine: optimize rule evaluation by skipping rules that can't match #830 @williballenthin
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
+93
-25
@@ -46,9 +46,12 @@ class Statement:
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
def evaluate(self, features: FeatureSet) -> Result:
|
||||
def evaluate(self, features: FeatureSet, short_circuit=True) -> Result:
|
||||
"""
|
||||
classes that inherit `Statement` must implement `evaluate`
|
||||
|
||||
args:
|
||||
short_circuit (bool): if true, then statements like and/or/some may short circuit.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@@ -73,35 +76,69 @@ class Statement:
|
||||
|
||||
|
||||
class And(Statement):
|
||||
"""match if all of the children evaluate to True."""
|
||||
"""
|
||||
match if all of the children evaluate to True.
|
||||
|
||||
the order of evaluation is dictated by the property
|
||||
`And.children` (type: List[Statement|Feature]).
|
||||
a query optimizer may safely manipulate the order of these children.
|
||||
"""
|
||||
|
||||
def __init__(self, children, description=None):
|
||||
super(And, self).__init__(description=description)
|
||||
self.children = children
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, short_circuit=True):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.and"] += 1
|
||||
|
||||
results = [child.evaluate(ctx) for child in self.children]
|
||||
success = all(results)
|
||||
return Result(success, self, results)
|
||||
if short_circuit:
|
||||
results = []
|
||||
for child in self.children:
|
||||
result = child.evaluate(ctx, short_circuit=short_circuit)
|
||||
results.append(result)
|
||||
if not result:
|
||||
# short circuit
|
||||
return Result(False, self, results)
|
||||
|
||||
return Result(True, self, results)
|
||||
else:
|
||||
results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
|
||||
success = all(results)
|
||||
return Result(success, self, results)
|
||||
|
||||
|
||||
class Or(Statement):
|
||||
"""match if any of the children evaluate to True."""
|
||||
"""
|
||||
match if any of the children evaluate to True.
|
||||
|
||||
the order of evaluation is dictated by the property
|
||||
`Or.children` (type: List[Statement|Feature]).
|
||||
a query optimizer may safely manipulate the order of these children.
|
||||
"""
|
||||
|
||||
def __init__(self, children, description=None):
|
||||
super(Or, self).__init__(description=description)
|
||||
self.children = children
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, short_circuit=True):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.or"] += 1
|
||||
|
||||
results = [child.evaluate(ctx) for child in self.children]
|
||||
success = any(results)
|
||||
return Result(success, self, results)
|
||||
if short_circuit:
|
||||
results = []
|
||||
for child in self.children:
|
||||
result = child.evaluate(ctx, short_circuit=short_circuit)
|
||||
results.append(result)
|
||||
if result:
|
||||
# short circuit as soon as we hit one match
|
||||
return Result(True, self, results)
|
||||
|
||||
return Result(False, self, results)
|
||||
else:
|
||||
results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
|
||||
success = any(results)
|
||||
return Result(success, self, results)
|
||||
|
||||
|
||||
class Not(Statement):
|
||||
@@ -111,34 +148,55 @@ class Not(Statement):
|
||||
super(Not, self).__init__(description=description)
|
||||
self.child = child
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, short_circuit=True):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.not"] += 1
|
||||
|
||||
results = [self.child.evaluate(ctx)]
|
||||
results = [self.child.evaluate(ctx, short_circuit=short_circuit)]
|
||||
success = not results[0]
|
||||
return Result(success, self, results)
|
||||
|
||||
|
||||
class Some(Statement):
|
||||
"""match if at least N of the children evaluate to True."""
|
||||
"""
|
||||
match if at least N of the children evaluate to True.
|
||||
|
||||
the order of evaluation is dictated by the property
|
||||
`Some.children` (type: List[Statement|Feature]).
|
||||
a query optimizer may safely manipulate the order of these children.
|
||||
"""
|
||||
|
||||
def __init__(self, count, children, description=None):
|
||||
super(Some, self).__init__(description=description)
|
||||
self.count = count
|
||||
self.children = children
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, short_circuit=True):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.some"] += 1
|
||||
|
||||
results = [child.evaluate(ctx) for child in self.children]
|
||||
# note that here we cast the child result as a bool
|
||||
# because we've overridden `__bool__` above.
|
||||
#
|
||||
# we can't use `if child is True` because the instance is not True.
|
||||
success = sum([1 for child in results if bool(child) is True]) >= self.count
|
||||
return Result(success, self, results)
|
||||
if short_circuit:
|
||||
results = []
|
||||
satisfied_children_count = 0
|
||||
for child in self.children:
|
||||
result = child.evaluate(ctx, short_circuit=short_circuit)
|
||||
results.append(result)
|
||||
if result:
|
||||
satisfied_children_count += 1
|
||||
|
||||
if satisfied_children_count >= self.count:
|
||||
# short circuit as soon as we hit the threshold
|
||||
return Result(True, self, results)
|
||||
|
||||
return Result(False, self, results)
|
||||
else:
|
||||
results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
|
||||
# note that here we cast the child result as a bool
|
||||
# because we've overridden `__bool__` above.
|
||||
#
|
||||
# we can't use `if child is True` because the instance is not True.
|
||||
success = sum([1 for child in results if bool(child) is True]) >= self.count
|
||||
return Result(success, self, results)
|
||||
|
||||
|
||||
class Range(Statement):
|
||||
@@ -150,7 +208,7 @@ class Range(Statement):
|
||||
self.min = min if min is not None else 0
|
||||
self.max = max if max is not None else (1 << 64 - 1)
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, **kwargs):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.range"] += 1
|
||||
|
||||
@@ -178,7 +236,7 @@ class Subscope(Statement):
|
||||
self.scope = scope
|
||||
self.child = child
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, **kwargs):
|
||||
raise ValueError("cannot evaluate a subscope directly!")
|
||||
|
||||
|
||||
@@ -241,8 +299,18 @@ def match(rules: List["capa.rules.Rule"], features: FeatureSet, va: int) -> Tupl
|
||||
features = collections.defaultdict(set, copy.copy(features))
|
||||
|
||||
for rule in rules:
|
||||
res = rule.evaluate(features)
|
||||
res = rule.evaluate(features, short_circuit=True)
|
||||
if res:
|
||||
# we first matched the rule with short circuiting enabled.
|
||||
# this is much faster than without short circuiting.
|
||||
# however, we want to collect all results thoroughly,
|
||||
# so once we've found a match quickly,
|
||||
# go back and capture results without short circuiting.
|
||||
res = rule.evaluate(features, short_circuit=False)
|
||||
|
||||
# sanity check
|
||||
assert bool(res) is True
|
||||
|
||||
results[rule.name].append((va, res))
|
||||
# we need to update the current `features`
|
||||
# because subsequent iterations of this loop may use newly added features,
|
||||
|
||||
+12
-4
@@ -146,7 +146,7 @@ class Feature:
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result:
|
||||
def evaluate(self, ctx: Dict["Feature", Set[int]], **kwargs) -> Result:
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature." + self.name] += 1
|
||||
return Result(self in ctx, self, [], locations=ctx.get(self, []))
|
||||
@@ -192,7 +192,7 @@ class Substring(String):
|
||||
super(Substring, self).__init__(value, description=description)
|
||||
self.value = value
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, short_circuit=True):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.substring"] += 1
|
||||
|
||||
@@ -210,6 +210,10 @@ class Substring(String):
|
||||
|
||||
if self.value in feature.value:
|
||||
matches[feature.value].extend(locations)
|
||||
if short_circuit:
|
||||
# we found one matching string, that's sufficient to match.
|
||||
# don't collect other matching strings in this mode.
|
||||
break
|
||||
|
||||
if matches:
|
||||
# finalize: defaultdict -> dict
|
||||
@@ -280,7 +284,7 @@ class Regex(String):
|
||||
"invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value
|
||||
)
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, short_circuit=True):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.regex"] += 1
|
||||
|
||||
@@ -302,6 +306,10 @@ class Regex(String):
|
||||
# so that they don't have to prefix/suffix their terms like: /.*foo.*/.
|
||||
if self.re.search(feature.value):
|
||||
matches[feature.value].extend(locations)
|
||||
if short_circuit:
|
||||
# we found one matching string, that's sufficient to match.
|
||||
# don't collect other matching strings in this mode.
|
||||
break
|
||||
|
||||
if matches:
|
||||
# finalize: defaultdict -> dict
|
||||
@@ -366,7 +374,7 @@ class Bytes(Feature):
|
||||
super(Bytes, self).__init__(value, description=description)
|
||||
self.value = value
|
||||
|
||||
def evaluate(self, ctx):
|
||||
def evaluate(self, ctx, **kwargs):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.bytes"] += 1
|
||||
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
import logging
|
||||
|
||||
import capa.engine as ceng
|
||||
import capa.features.common
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_node_cost(node):
    """
    estimate the relative cost of evaluating the given node; lower is cheaper.

    the weights are heuristic:
      - 0: OS/Arch/Format features
      - 1: any node not otherwise listed here (expected to be hash-lookup features)
      - 2: Substring/Regex/Bytes features
      - 1 + cost(child): Not/Range
      - 1 + sum(cost(children)): And/Or/Some

    args:
      node: a statement or feature from a rule's logic tree.

    returns:
      int: the estimated cost.
    """
    common = capa.features.common

    # we assume these are the most restrictive features:
    # authors commonly put them at the start of rules to restrict
    # the category of samples to inspect, so they get the lowest cost.
    if isinstance(node, (common.OS, common.Arch, common.Format)):
        return 0

    # substring and regex features require a full scan of each string,
    # which we anticipate is more expensive than a hash lookup feature (e.g. mnemonic or count).
    # bytes features are given the same weight here.
    #
    # TODO: compute the average cost of these features relative to hash features
    # and adjust the factor accordingly.
    if isinstance(node, (common.Substring, common.Regex, common.Bytes)):
        return 2

    # single-child nodes: the cost is defined by the complexity of the child.
    if isinstance(node, (ceng.Not, ceng.Range)):
        return get_node_cost(node.child) + 1

    # multi-child nodes: charge the full cost of all children,
    # as this is the worst-case scenario.
    if isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
        return 1 + sum(get_node_cost(child) for child in node.children)

    # everything else should be a hash-lookup feature.
    # we give this an arbitrary weight of 1.
    # the only thing more "important" than this is checking OS/Arch/Format.
    return 1
|
||||
|
||||
|
||||
def optimize_statement(statement):
    """
    reorder the logic tree rooted at `statement` so that cheaper nodes are evaluated first.

    this routine operates in-place and returns nothing.

    And/Or/Some nodes have their `children` sorted by ascending `get_node_cost`.
    the sort is stable, so children with equal cost keep their authored order.
    nested statements are optimized too, so that short circuiting benefits
    from the ordering at every level of the tree, not only at the root.

    args:
      statement: a statement or feature from a rule's logic tree.
    """
    if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
        # has .children
        #
        # descend first: nested and/or/some nodes also benefit from reordering.
        # the cost of a node doesn't depend on the order of its children,
        # so optimizing the children doesn't affect the sort below.
        for child in statement.children:
            optimize_statement(child)

        statement.children = sorted(statement.children, key=get_node_cost)
    elif isinstance(statement, (ceng.Not, ceng.Range)):
        # has .child
        optimize_statement(statement.child)
    # otherwise: appears to be "simple" (e.g. a feature), nothing to reorder.
|
||||
|
||||
|
||||
def optimize_rule(rule):
    """
    optimize the logic tree of a single rule.

    this routine operates in-place: it hands `rule.statement` to `optimize_statement`.
    """
    root = rule.statement
    optimize_statement(root)
|
||||
|
||||
|
||||
def optimize_rules(rules):
    """
    optimize each of the given rules in-place.

    args:
      rules: a sized collection of rules; each is passed to `optimize_rule`.

    returns:
      the same `rules` object that was passed in, for convenient chaining.
    """
    rule_count = len(rules)
    logger.debug("optimizing %d rules", rule_count)

    for candidate in rules:
        optimize_rule(candidate)

    return rules
|
||||
+5
-2
@@ -31,6 +31,7 @@ import ruamel.yaml
|
||||
import capa.perf
|
||||
import capa.engine as ceng
|
||||
import capa.features
|
||||
import capa.optimizer
|
||||
import capa.features.file
|
||||
import capa.features.insn
|
||||
import capa.features.common
|
||||
@@ -627,10 +628,10 @@ class Rule:
|
||||
for new_rule in self._extract_subscope_rules_rec(self.statement):
|
||||
yield new_rule
|
||||
|
||||
def evaluate(self, features: FeatureSet):
|
||||
def evaluate(self, features: FeatureSet, short_circuit=True):
|
||||
capa.perf.counters["evaluate.feature"] += 1
|
||||
capa.perf.counters["evaluate.feature.rule"] += 1
|
||||
return self.statement.evaluate(features)
|
||||
return self.statement.evaluate(features, short_circuit=short_circuit)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, d, definition):
|
||||
@@ -968,6 +969,8 @@ class RuleSet:
|
||||
if len(rules) == 0:
|
||||
raise InvalidRuleSet("no rules selected")
|
||||
|
||||
rules = capa.optimizer.optimize_rules(rules)
|
||||
|
||||
self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE)
|
||||
self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE)
|
||||
self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE)
|
||||
|
||||
+37
-2
@@ -1,3 +1,34 @@
|
||||
"""
|
||||
Invoke capa multiple times and record profiling information.
|
||||
Use the --number and --repeat options to change the number of iterations.
|
||||
By default, the script will emit a markdown table with a label pulled from git.
|
||||
|
||||
Note: you can run this script against pre-generated .frz files to reduce the startup time.
|
||||
|
||||
usage:
|
||||
|
||||
usage: profile-time.py [--number NUMBER] [--repeat REPEAT] [--label LABEL] sample
|
||||
|
||||
Profile capa performance
|
||||
|
||||
positional arguments:
|
||||
sample path to sample to analyze
|
||||
|
||||
optional arguments:
|
||||
--number NUMBER batch size of profile collection
|
||||
--repeat REPEAT batch count of profile collection
|
||||
--label LABEL description of the profile collection
|
||||
|
||||
example:
|
||||
|
||||
$ python profile-time.py ./tests/data/kernel32.dll_.frz --number 1 --repeat 2
|
||||
|
||||
| label | count(evaluations) | avg(time) | min(time) | max(time) |
|
||||
|--------------------------------------|----------------------|-------------|-------------|-------------|
|
||||
| 18c30e4 main: remove perf debug msgs | 66,561,622 | 132.13s | 125.14s | 139.12s |
|
||||
|
||||
^^^ --label or git hash
|
||||
"""
|
||||
import sys
|
||||
import timeit
|
||||
import logging
|
||||
@@ -98,12 +129,16 @@ def main(argv=None):
|
||||
(
|
||||
args.label,
|
||||
"{:,}".format(capa.perf.counters["evaluate.feature"]),
|
||||
"%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)),
|
||||
# python documentation indicates that min(samples) should be preferred,
|
||||
# so let's put that first.
|
||||
#
|
||||
# https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
|
||||
"%0.2fs" % (min(samples) / float(args.number)),
|
||||
"%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)),
|
||||
"%0.2fs" % (max(samples) / float(args.number)),
|
||||
)
|
||||
],
|
||||
headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"],
|
||||
headers=["label", "count(evaluations)", "min(time)", "avg(time)", "max(time)"],
|
||||
tablefmt="github",
|
||||
)
|
||||
)
|
||||
|
||||
@@ -533,3 +533,29 @@ def test_render_offset():
|
||||
assert str(capa.features.insn.Offset(1)) == "offset(0x1)"
|
||||
assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X32)) == "offset/x32(0x1)"
|
||||
assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X64)) == "offset/x64(0x1)"
|
||||
|
||||
|
||||
def test_short_circuit():
    """the `short_circuit` flag controls how many child results an `Or` captures."""
    # only the first child's feature is present.
    features = {Number(1): {1}}

    def first_or_second():
        # build a fresh statement for each evaluation.
        return Or([Number(1), Number(2)])

    assert first_or_second().evaluate(features) == True

    # with short circuiting, only the children up until the first satisfied child are captured.
    shorted = first_or_second().evaluate(features, short_circuit=True)
    assert len(shorted.children) == 1

    # without short circuiting, every child is evaluated and captured.
    thorough = first_or_second().evaluate(features, short_circuit=False)
    assert len(thorough.children) == 2
|
||||
|
||||
|
||||
def test_eval_order():
    """children of an `Or` are evaluated in their listed order when short circuiting (the default)."""

    def evaluate(features):
        # build a fresh statement for each evaluation, using the default short circuiting.
        return Or([Number(1), Number(2)]).evaluate(features)

    only_first = {Number(1): {1}}
    only_second = {Number(2): {1}}
    both = {Number(1): {1}, Number(2): {1}}

    # base cases.
    assert evaluate(only_first) == True
    assert evaluate(only_second) == True

    # with short circuiting, only the children up until the first satisfied child are captured.
    assert len(evaluate(only_first).children) == 1
    assert len(evaluate(only_second).children) == 2
    assert len(evaluate(both).children) == 1

    # and it's guaranteed that children are evaluated in order.
    assert evaluate(only_first).children[0].statement == Number(1)
    assert evaluate(only_first).children[0].statement != Number(2)

    assert evaluate(only_second).children[1].statement == Number(2)
    assert evaluate(only_second).children[1].statement != Number(1)
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
# Copyright (C) 2021 FireEye, Inc. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
import capa.rules
|
||||
import capa.engine
|
||||
import capa.optimizer
|
||||
import capa.features.common
|
||||
from capa.engine import Or, And
|
||||
from capa.features.insn import Mnemonic
|
||||
from capa.features.common import Arch, Bytes, Substring
|
||||
|
||||
|
||||
def test_optimizer_order():
|
||||
rule = textwrap.dedent(
|
||||
"""
|
||||
rule:
|
||||
meta:
|
||||
name: test rule
|
||||
scope: function
|
||||
features:
|
||||
- and:
|
||||
- substring: "foo"
|
||||
- arch: amd64
|
||||
- mnemonic: cmp
|
||||
- and:
|
||||
- bytes: 3
|
||||
- offset: 2
|
||||
- or:
|
||||
- number: 1
|
||||
- offset: 4
|
||||
"""
|
||||
)
|
||||
r = capa.rules.Rule.from_yaml(rule)
|
||||
|
||||
# before optimization
|
||||
children = list(r.statement.get_children())
|
||||
assert isinstance(children[0], Substring)
|
||||
assert isinstance(children[1], Arch)
|
||||
assert isinstance(children[2], Mnemonic)
|
||||
assert isinstance(children[3], And)
|
||||
assert isinstance(children[4], Or)
|
||||
|
||||
# after optimization
|
||||
capa.optimizer.optimize_rules([r])
|
||||
children = list(r.statement.get_children())
|
||||
|
||||
# cost: 0
|
||||
assert isinstance(children[0], Arch)
|
||||
# cost: 1
|
||||
assert isinstance(children[1], Mnemonic)
|
||||
# cost: 2
|
||||
assert isinstance(children[2], Substring)
|
||||
# cost: 3
|
||||
assert isinstance(children[3], Or)
|
||||
# cost: 4
|
||||
assert isinstance(children[4], And)
|
||||
Reference in New Issue
Block a user