mirror of
https://github.com/mandiant/capa.git
synced 2026-03-12 21:23:12 -07:00
perf: eliminate O(n²) tuple growth and reduce per-match overhead (#2890)
* perf: eliminate O(n²) tuple growth and reduce per-match overhead Four data-driven performance improvements identified by profiling the hot paths in capa's rule-matching and capability-finding pipeline: 1. find_static_capabilities / find_dynamic_capabilities (O(n²) → O(n)) Tuple concatenation with `t += (item,)` copies the entire tuple on every iteration. For a binary with N functions this allocates O(N²) total objects. Replace with list accumulation and a single `tuple(list)` conversion at the end. 2. RuleSet._match: pre-compute rule_index_by_rule_name (O(n) → O(1)) `_match` is called once per instruction / basic-block / function scope (potentially millions of times). Previously it rebuilt the name→index dict on every call. The dict is now computed once in `__init__` and stored as `_rule_index_by_scope`, reducing each call to a dict lookup. 3. RuleSet._match: candidate_rules.pop(0) → deque.popleft() (O(n) → O(1)) `list.pop(0)` is O(n) because it shifts every remaining element. Switch to `collections.deque` for O(1) left-side consumption. 4. RuleSet._extract_subscope_rules: list.pop(0) → deque.popleft() (O(n²) → O(n)) Same issue: BFS over rules used list.pop(0), making the whole loop quadratic. Changed to a deque-based queue for linear-time processing. Fixes #2880 * perf: use sorted merge instead of full re-sort for new rule candidates When a rule matches and introduces new dependent candidates into _match's work queue, the previous approach converted the deque to a list, extended it with the new items, and re-sorted the whole collection — O((k+m) log(k+m)). Because the existing deque is already topologically sorted, we only need to sort the new additions — O(m log m) — and then merge the two sorted sequences in O(k+m) using heapq.merge. Also adds a CHANGELOG entry for the performance improvements in #2890.
* perf: simplify candidate_rules to LIFO list, revert heapq.merge Address reviewer feedback: - Replace deque+popleft with list+pop (LIFO stack) in _extract_subscope_rules; processing order doesn't affect correctness, and list.pop() is O(1). - Replace deque+popleft with list+pop (LIFO stack) in _match; sort candidate rules descending so pop() from the end yields the topologically-first rule. - Revert heapq.merge back to the simpler extend+re-sort pattern; the added complexity wasn't justified given the typically small candidate set. - Remove now-unused `import heapq`.
This commit is contained in:
committed by
GitHub
parent
8c138e3d22
commit
2c9e30c3e1
@@ -52,6 +52,10 @@
|
|||||||
|
|
||||||
### capa Explorer IDA Pro plugin
|
### capa Explorer IDA Pro plugin
|
||||||
|
|
||||||
|
### Performance
|
||||||
|
|
||||||
|
- perf: eliminate O(n²) tuple growth and reduce per-match overhead @devs6186 #2890
|
||||||
|
|
||||||
### Development
|
### Development
|
||||||
|
|
||||||
- doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410
|
- doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410
|
||||||
|
|||||||
@@ -277,7 +277,9 @@ def find_dynamic_capabilities(
|
|||||||
all_span_matches: MatchResults = collections.defaultdict(list)
|
all_span_matches: MatchResults = collections.defaultdict(list)
|
||||||
all_call_matches: MatchResults = collections.defaultdict(list)
|
all_call_matches: MatchResults = collections.defaultdict(list)
|
||||||
|
|
||||||
feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())
|
# Accumulate into a list to avoid O(n²) tuple concatenation.
|
||||||
|
# Tuples are immutable, so `t += (x,)` copies the entire tuple each time.
|
||||||
|
process_feature_counts: list[rdoc.ProcessFeatureCount] = []
|
||||||
|
|
||||||
assert isinstance(extractor, DynamicFeatureExtractor)
|
assert isinstance(extractor, DynamicFeatureExtractor)
|
||||||
processes: list[ProcessHandle] = list(extractor.get_processes())
|
processes: list[ProcessHandle] = list(extractor.get_processes())
|
||||||
@@ -289,10 +291,10 @@ def find_dynamic_capabilities(
|
|||||||
task = pbar.add_task("matching", total=n_processes, unit="processes")
|
task = pbar.add_task("matching", total=n_processes, unit="processes")
|
||||||
for p in processes:
|
for p in processes:
|
||||||
process_capabilities = find_process_capabilities(ruleset, extractor, p)
|
process_capabilities = find_process_capabilities(ruleset, extractor, p)
|
||||||
feature_counts.processes += (
|
process_feature_counts.append(
|
||||||
rdoc.ProcessFeatureCount(
|
rdoc.ProcessFeatureCount(
|
||||||
address=frz.Address.from_capa(p.address), count=process_capabilities.feature_count
|
address=frz.Address.from_capa(p.address), count=process_capabilities.feature_count
|
||||||
),
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule_name, res in process_capabilities.process_matches.items():
|
for rule_name, res in process_capabilities.process_matches.items():
|
||||||
@@ -317,7 +319,11 @@ def find_dynamic_capabilities(
|
|||||||
capa.engine.index_rule_matches(process_and_lower_features, rule, locations)
|
capa.engine.index_rule_matches(process_and_lower_features, rule, locations)
|
||||||
|
|
||||||
all_file_capabilities = find_file_capabilities(ruleset, extractor, process_and_lower_features)
|
all_file_capabilities = find_file_capabilities(ruleset, extractor, process_and_lower_features)
|
||||||
feature_counts.file = all_file_capabilities.feature_count
|
|
||||||
|
feature_counts = rdoc.DynamicFeatureCounts(
|
||||||
|
file=all_file_capabilities.feature_count,
|
||||||
|
processes=tuple(process_feature_counts),
|
||||||
|
)
|
||||||
|
|
||||||
matches = dict(
|
matches = dict(
|
||||||
itertools.chain(
|
itertools.chain(
|
||||||
|
|||||||
@@ -156,8 +156,11 @@ def find_static_capabilities(
|
|||||||
all_bb_matches: MatchResults = collections.defaultdict(list)
|
all_bb_matches: MatchResults = collections.defaultdict(list)
|
||||||
all_insn_matches: MatchResults = collections.defaultdict(list)
|
all_insn_matches: MatchResults = collections.defaultdict(list)
|
||||||
|
|
||||||
feature_counts = rdoc.StaticFeatureCounts(file=0, functions=())
|
# Accumulate into lists to avoid O(n²) tuple concatenation.
|
||||||
library_functions: tuple[rdoc.LibraryFunction, ...] = ()
|
# Tuples are immutable, so `t += (x,)` copies the entire tuple each time.
|
||||||
|
# For binaries with thousands of functions this becomes quadratic in memory work.
|
||||||
|
function_feature_counts: list[rdoc.FunctionFeatureCount] = []
|
||||||
|
library_functions_list: list[rdoc.LibraryFunction] = []
|
||||||
|
|
||||||
assert isinstance(extractor, StaticFeatureExtractor)
|
assert isinstance(extractor, StaticFeatureExtractor)
|
||||||
functions: list[FunctionHandle] = list(extractor.get_functions())
|
functions: list[FunctionHandle] = list(extractor.get_functions())
|
||||||
@@ -176,20 +179,20 @@ def find_static_capabilities(
|
|||||||
if extractor.is_library_function(f.address):
|
if extractor.is_library_function(f.address):
|
||||||
function_name = extractor.get_function_name(f.address)
|
function_name = extractor.get_function_name(f.address)
|
||||||
logger.debug("skipping library function 0x%x (%s)", f.address, function_name)
|
logger.debug("skipping library function 0x%x (%s)", f.address, function_name)
|
||||||
library_functions += (
|
library_functions_list.append(
|
||||||
rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name),
|
rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name)
|
||||||
)
|
)
|
||||||
n_libs = len(library_functions)
|
n_libs = len(library_functions_list)
|
||||||
percentage = round(100 * (n_libs / n_funcs))
|
percentage = round(100 * (n_libs / n_funcs))
|
||||||
pbar.update(task, postfix=f"skipped {n_libs} library functions, {percentage}%")
|
pbar.update(task, postfix=f"skipped {n_libs} library functions, {percentage}%")
|
||||||
pbar.advance(task)
|
pbar.advance(task)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
code_capabilities = find_code_capabilities(ruleset, extractor, f)
|
code_capabilities = find_code_capabilities(ruleset, extractor, f)
|
||||||
feature_counts.functions += (
|
function_feature_counts.append(
|
||||||
rdoc.FunctionFeatureCount(
|
rdoc.FunctionFeatureCount(
|
||||||
address=frz.Address.from_capa(f.address), count=code_capabilities.feature_count
|
address=frz.Address.from_capa(f.address), count=code_capabilities.feature_count
|
||||||
),
|
)
|
||||||
)
|
)
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
|
|
||||||
@@ -230,7 +233,11 @@ def find_static_capabilities(
|
|||||||
capa.engine.index_rule_matches(function_and_lower_features, rule, locations)
|
capa.engine.index_rule_matches(function_and_lower_features, rule, locations)
|
||||||
|
|
||||||
all_file_capabilities = find_file_capabilities(ruleset, extractor, function_and_lower_features)
|
all_file_capabilities = find_file_capabilities(ruleset, extractor, function_and_lower_features)
|
||||||
feature_counts.file = all_file_capabilities.feature_count
|
|
||||||
|
feature_counts = rdoc.StaticFeatureCounts(
|
||||||
|
file=all_file_capabilities.feature_count,
|
||||||
|
functions=tuple(function_feature_counts),
|
||||||
|
)
|
||||||
|
|
||||||
matches: MatchResults = dict(
|
matches: MatchResults = dict(
|
||||||
itertools.chain(
|
itertools.chain(
|
||||||
@@ -244,4 +251,4 @@ def find_static_capabilities(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return Capabilities(matches, feature_counts, library_functions)
|
return Capabilities(matches, feature_counts, tuple(library_functions_list))
|
||||||
|
|||||||
@@ -1447,6 +1447,13 @@ class RuleSet:
|
|||||||
scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes
|
scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Pre-compute the topological index mapping for each scope.
|
||||||
|
# This avoids rebuilding the dict on every call to _match (which runs once per
|
||||||
|
# instruction/basic-block/function/file scope, i.e. potentially millions of times).
|
||||||
|
self._rule_index_by_scope: dict[Scope, dict[str, int]] = {
|
||||||
|
scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes
|
||||||
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def file_rules(self):
|
def file_rules(self):
|
||||||
return self.rules_by_scope[Scope.FILE]
|
return self.rules_by_scope[Scope.FILE]
|
||||||
@@ -1876,11 +1883,13 @@ class RuleSet:
|
|||||||
"""
|
"""
|
||||||
done = []
|
done = []
|
||||||
|
|
||||||
# use a queue of rules, because we'll be modifying the list (appending new items) as we go.
|
# use a list as a stack: append new items and pop() from the end, both O(1).
|
||||||
while rules:
|
# order doesn't matter here since every rule in the queue is processed eventually.
|
||||||
rule = rules.pop(0)
|
rules_stack = list(rules)
|
||||||
|
while rules_stack:
|
||||||
|
rule = rules_stack.pop()
|
||||||
for subscope_rule in rule.extract_subscope_rules():
|
for subscope_rule in rule.extract_subscope_rules():
|
||||||
rules.append(subscope_rule)
|
rules_stack.append(subscope_rule)
|
||||||
done.append(rule)
|
done.append(rule)
|
||||||
|
|
||||||
return done
|
return done
|
||||||
@@ -1929,11 +1938,11 @@ class RuleSet:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope]
|
feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope]
|
||||||
rules: list[Rule] = self.rules_by_scope[scope]
|
|
||||||
# Topologic location of rule given its name.
|
# Topologic location of rule given its name.
|
||||||
# That is, rules with a lower index should be evaluated first, since their dependencies
|
# That is, rules with a lower index should be evaluated first, since their dependencies
|
||||||
# will be evaluated later.
|
# will be evaluated later.
|
||||||
rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)}
|
# Pre-computed in __init__ to avoid rebuilding on every _match call.
|
||||||
|
rule_index_by_rule_name = self._rule_index_by_scope[scope]
|
||||||
|
|
||||||
# This algorithm is optimized to evaluate as few rules as possible,
|
# This algorithm is optimized to evaluate as few rules as possible,
|
||||||
# because the less work we do, the faster capa can run.
|
# because the less work we do, the faster capa can run.
|
||||||
@@ -2029,7 +2038,9 @@ class RuleSet:
|
|||||||
candidate_rules = [self.rules[name] for name in candidate_rule_names]
|
candidate_rules = [self.rules[name] for name in candidate_rule_names]
|
||||||
|
|
||||||
# Order rules topologically, so that rules with dependencies work correctly.
|
# Order rules topologically, so that rules with dependencies work correctly.
|
||||||
|
# Sort descending so pop() from the end yields the topologically-first rule in O(1).
|
||||||
RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
|
RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
|
||||||
|
candidate_rules.reverse()
|
||||||
|
|
||||||
#
|
#
|
||||||
# The following is derived from ceng.match
|
# The following is derived from ceng.match
|
||||||
@@ -2044,7 +2055,7 @@ class RuleSet:
|
|||||||
augmented_features = features
|
augmented_features = features
|
||||||
|
|
||||||
while candidate_rules:
|
while candidate_rules:
|
||||||
rule = candidate_rules.pop(0)
|
rule = candidate_rules.pop()
|
||||||
res = rule.evaluate(augmented_features, short_circuit=True)
|
res = rule.evaluate(augmented_features, short_circuit=True)
|
||||||
if res:
|
if res:
|
||||||
# we first matched the rule with short circuiting enabled.
|
# we first matched the rule with short circuiting enabled.
|
||||||
@@ -2083,6 +2094,7 @@ class RuleSet:
|
|||||||
candidate_rule_names.update(new_candidates)
|
candidate_rule_names.update(new_candidates)
|
||||||
candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
|
candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
|
||||||
RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
|
RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
|
||||||
|
candidate_rules.reverse()
|
||||||
|
|
||||||
return (augmented_features, results)
|
return (augmented_features, results)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user