mirror of
https://github.com/mandiant/capa.git
synced 2025-12-08 05:40:36 -08:00
71 lines
2.3 KiB
Python
71 lines
2.3 KiB
Python
import logging
|
|
|
|
import capa.engine as ceng
|
|
import capa.features.common
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_node_cost(node):
    """
    Estimate the relative cost of evaluating a rule node.

    The result is a small integer used only for ordering: lower means
    "evaluate this earlier". Compound nodes are charged 1 plus the cost of
    everything beneath them.

    Args:
      node: a feature or statement taken from a rule's logic tree.

    Returns:
      int: the estimated cost of the node.
    """
    # OS/Arch/Format are assumed to be the most restrictive features:
    # rule authors commonly place them first to narrow the category of
    # samples worth inspecting, so they get the lowest cost.
    if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)):
        return 0

    # substring, regex and bytes features require a full scan of each
    # candidate value, which we anticipate is more expensive than a
    # hash-lookup feature (e.g. mnemonic or count).
    #
    # TODO: compute the average cost of these features relative to a hash
    #  feature and adjust the factor accordingly.
    if isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)):
        return 2

    # single-child statements: cost is driven by the complexity of the child.
    if isinstance(node, (ceng.Not, ceng.Range)):
        return 1 + get_node_cost(node.child)

    # multi-child statements: charge the full cost of all children,
    # since that is the worst-case scenario.
    if isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
        return 1 + sum(get_node_cost(child) for child in node.children)

    # everything else should be a hash-lookup feature.
    # it gets an arbitrary weight of 1; the only thing more "important"
    # than this is checking OS/Arch/Format.
    return 1
|
|
|
|
|
|
def optimize_statement(statement):
    """
    Reorder a statement tree, in place, so that cheaper nodes come first.

    Every And/Or/Some statement reachable from `statement` has its
    `children` sorted by ascending `get_node_cost`. Nested compound
    statements are optimized too, not only the outermost one.

    Args:
      statement: the root statement (or feature) of the tree to optimize.

    Returns:
      None: this routine operates in-place.
    """
    if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
        # has .children
        #
        # optimize each subtree first, so nested And/Or/Some statements
        # are reordered as well.
        for child in statement.children:
            optimize_statement(child)

        # sorted() is stable, so children of equal cost keep the relative
        # order in which they appeared.
        statement.children = sorted(statement.children, key=get_node_cost)
    elif isinstance(statement, (ceng.Not, ceng.Range)):
        # has .child
        optimize_statement(statement.child)
    # otherwise: appears to be "simple" (a leaf), nothing to reorder.
|
|
|
|
|
|
def optimize_rule(rule):
    """
    Optimize the logic tree of a single rule.

    The rule's `statement` is handed to `optimize_statement`; the rule
    object itself is modified and nothing is returned.
    """
    # this routine operates in-place
    root = rule.statement
    optimize_statement(root)
|
|
|
|
|
|
def optimize_rules(rules):
    """
    Optimize every rule in `rules` and hand the same collection back.

    Args:
      rules: a sized collection of rules; each one is passed to
        `optimize_rule`.

    Returns:
      the `rules` object that was passed in.
    """
    total = len(rules)
    logger.debug("optimizing %d rules", total)

    for current in rules:
        optimize_rule(current)

    return rules
|