mirror of
https://github.com/mandiant/capa.git
synced 2026-02-04 11:07:53 -08:00
Merge pull request #829 from mandiant/perf/query-optimizer
perf: add query optimizer
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
### New Features
|
||||
|
||||
- engine: short circuit logic nodes for better performance #824 @williballenthin
|
||||
- engine: add optimizer to order faster nodes first #829 @williballenthin
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
|
||||
70
capa/optimizer.py
Normal file
70
capa/optimizer.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import logging
|
||||
|
||||
import capa.engine as ceng
|
||||
import capa.features.common
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_node_cost(node):
    """
    Estimate the relative cost of evaluating a rule node.

    The result is only meaningful when compared against the cost of other nodes:
    a lower number means "evaluate this one earlier".

    Args:
      node: a feature or a logic statement taken from a rule's statement tree.

    Returns:
      int: the estimated cost; 0 is the cheapest.
    """
    if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)):
        # we assume these are the most restrictive features:
        # authors commonly use them at the start of rules to restrict the category of samples to inspect
        return 0

    # elif "everything else":
    #     return 1
    #
    # this should be all hash-lookup features.
    # see below.

    elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)):
        # substring and regex features require a full scan of each string
        # which we anticipate is more expensive than a hash lookup feature (e.g. mnemonic or count).
        #
        # NOTE(review): bytes features are grouped here as well; presumably they are matched by
        # comparing against every extracted byte sequence rather than via a hash lookup - confirm.
        #
        # TODO: compute the average cost of these features relative to a hash feature
        # and adjust the factor accordingly.
        return 2

    elif isinstance(node, (ceng.Not, ceng.Range)):
        # the cost of these nodes is defined by the complexity of their single child,
        # plus one for evaluating the wrapping node itself.
        return 1 + get_node_cost(node.child)

    elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
        # the cost of these nodes is the full cost of their children,
        # as this is the worst-case scenario,
        # plus one for evaluating the logic node itself.
        #
        # the extra one keeps a nested logic node more expensive than any single leaf it contains,
        # so that leaves are tried before descending into a subtree.
        return 1 + sum(map(get_node_cost, node.children))

    else:
        # this should be all hash-lookup features.
        # we give this an arbitrary weight of 1.
        # the only thing more "important" than this is checking OS/Arch/Format.
        return 1
|
||||
|
||||
|
||||
def optimize_statement(statement):
    """
    Reorder the children of logic nodes so that cheaper nodes come first.

    This routine operates in-place and walks the whole tree below `statement`.

    Args:
      statement: the root of a (sub)tree of logic statements and features.

    Returns:
      None
    """
    if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
        # has .children
        #
        # optimize the nested statements first, so that logic nodes further down the tree
        # also get their children reordered, not only the node we were handed.
        for child in statement.children:
            optimize_statement(child)

        # `sorted` is stable: nodes with the same cost keep the order the rule author chose.
        statement.children = sorted(statement.children, key=get_node_cost)
        return
    elif isinstance(statement, (ceng.Not, ceng.Range)):
        # has .child
        optimize_statement(statement.child)
        return
    else:
        # appears to be "simple": nothing below it to reorder.
        return
|
||||
|
||||
|
||||
def optimize_rule(rule):
    """
    Optimize the statement tree of a single rule.

    The rule is modified in-place; nothing is returned.

    Args:
      rule: an object exposing its logic tree as `.statement`.
    """
    root = rule.statement
    optimize_statement(root)
|
||||
|
||||
|
||||
def optimize_rules(rules):
    """
    Optimize every rule in the given collection.

    Each rule is modified in-place via `optimize_rule`.

    Args:
      rules: a sized, iterable collection of rules.

    Returns:
      the same `rules` object that was passed in.
    """
    rule_count = len(rules)
    logger.debug("optimizing %d rules", rule_count)

    for current in rules:
        optimize_rule(current)

    return rules
|
||||
@@ -30,6 +30,7 @@ import ruamel.yaml
|
||||
import capa.perf
|
||||
import capa.engine as ceng
|
||||
import capa.features
|
||||
import capa.optimizer
|
||||
import capa.features.file
|
||||
import capa.features.insn
|
||||
import capa.features.common
|
||||
@@ -961,6 +962,8 @@ class RuleSet:
|
||||
if len(rules) == 0:
|
||||
raise InvalidRuleSet("no rules selected")
|
||||
|
||||
rules = capa.optimizer.optimize_rules(rules)
|
||||
|
||||
self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE)
|
||||
self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE)
|
||||
self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE)
|
||||
|
||||
65
tests/test_optimizer.py
Normal file
65
tests/test_optimizer.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# Copyright (C) 2021 FireEye, Inc. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
import capa.rules
|
||||
import capa.engine
|
||||
import capa.optimizer
|
||||
import capa.features.common
|
||||
from capa.engine import Or, And
|
||||
from capa.features.insn import Mnemonic
|
||||
from capa.features.common import Arch, Bytes, Substring
|
||||
|
||||
|
||||
def test_optimizer_order():
    """Optimizing a rule must reorder its top-level children, cheapest first."""
    rule_yaml = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
                scope: function
            features:
                - and:
                    - substring: "foo"
                    - arch: amd64
                    - mnemonic: cmp
                    - and:
                        - bytes: 3
                        - offset: 2
                    - or:
                        - number: 1
                        - offset: 4
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_yaml)

    def assert_child_types(expected_types):
        # compare the current top-level children, position by position, against the expected node types.
        nodes = list(parsed.statement.get_children())
        for position, expected in enumerate(expected_types):
            assert isinstance(nodes[position], expected)

    # before optimization: the children appear in the order the rule was written.
    assert_child_types((Substring, Arch, Mnemonic, And, Or))

    # after optimization: the arch restriction comes first, then the mnemonic,
    # then the substring, and finally the nested `or` ahead of the nested `and`.
    capa.optimizer.optimize_rules([parsed])
    assert_child_types((Arch, Mnemonic, Substring, Or, And))
|
||||
Reference in New Issue
Block a user