From 2c9e30c3e16a8fe75c2fec520bf5da7a70cc72c3 Mon Sep 17 00:00:00 2001 From: Devyansh Somvanshi <144378426+devs6186@users.noreply.github.com> Date: Wed, 11 Mar 2026 01:51:48 +0530 Subject: [PATCH] =?UTF-8?q?perf:=20eliminate=20O(n=C2=B2)=20tuple=20growth?= =?UTF-8?q?=20and=20reduce=20per-match=20overhead=20(#2890)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf: eliminate O(n²) tuple growth and reduce per-match overhead Four data-driven performance improvements identified by profiling the hot paths in capa's rule-matching and capability-finding pipeline: 1. find_static_capabilities / find_dynamic_capabilities (O(n²) → O(n)) Tuple concatenation with `t += (item,)` copies the entire tuple on every iteration. For a binary with N functions this allocates O(N²) total objects. Replace with list accumulation and a single `tuple(list)` conversion at the end. 2. RuleSet._match: pre-compute rule_index_by_rule_name (O(n) → O(1)) `_match` is called once per instruction / basic-block / function scope (potentially millions of times). Previously it rebuilt the name→index dict on every call. The dict is now computed once in `__init__` and stored as `_rule_index_by_scope`, reducing each call to a dict lookup. 3. RuleSet._match: candidate_rules.pop(0) → deque.popleft() (O(n) → O(1)) `list.pop(0)` is O(n) because it shifts every remaining element. Switch to `collections.deque` for O(1) left-side consumption. 4. RuleSet._extract_subscope_rules: list.pop(0) → deque.popleft() (O(n²) → O(n)) Same issue: BFS over rules used list.pop(0), making the whole loop quadratic. Changed to a deque queue for linear-time processing. Fixes #2880 * perf: use sorted merge instead of full re-sort for new rule candidates When a rule matches and introduces new dependent candidates into _match's work queue, the previous approach converted the deque to a list, extended it with the new items, and re-sorted the whole collection — O((k+m) log(k+m)). 
Because the existing deque is already topologically sorted, we only need to sort the new additions — O(m log m) — and then merge the two sorted sequences in O(k+m) using heapq.merge. Also adds a CHANGELOG entry for the performance improvements in #2890. * perf: simplify candidate_rules to LIFO list, revert heapq.merge Address reviewer feedback: - Replace deque+popleft with list+pop (LIFO stack) in _extract_subscope_rules; processing order doesn't affect correctness, and list.pop() is O(1). - Replace deque+popleft with list+pop (LIFO stack) in _match; sort candidate rules descending so pop() from the end yields the topologically-first rule. - Revert heapq.merge back to the simpler extend+re-sort pattern; the added complexity wasn't justified given the typically small candidate set. - Remove now-unused `import heapq`. --- CHANGELOG.md | 4 ++++ capa/capabilities/dynamic.py | 14 ++++++++++---- capa/capabilities/static.py | 25 ++++++++++++++++--------- capa/rules/__init__.py | 26 +++++++++++++++++++------- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59402bb5..7e644691 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,10 @@ ### capa Explorer IDA Pro plugin +### Performance + +- perf: eliminate O(n²) tuple growth and reduce per-match overhead @devs6186 #2890 + ### Development - doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410 diff --git a/capa/capabilities/dynamic.py b/capa/capabilities/dynamic.py index a84f5d3f..3c98a16c 100644 --- a/capa/capabilities/dynamic.py +++ b/capa/capabilities/dynamic.py @@ -277,7 +277,9 @@ def find_dynamic_capabilities( all_span_matches: MatchResults = collections.defaultdict(list) all_call_matches: MatchResults = collections.defaultdict(list) - feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=()) + # Accumulate into a list to avoid O(n²) tuple concatenation. 
+ # Tuples are immutable, so `t += (x,)` copies the entire tuple each time. + process_feature_counts: list[rdoc.ProcessFeatureCount] = [] assert isinstance(extractor, DynamicFeatureExtractor) processes: list[ProcessHandle] = list(extractor.get_processes()) @@ -289,10 +291,10 @@ def find_dynamic_capabilities( task = pbar.add_task("matching", total=n_processes, unit="processes") for p in processes: process_capabilities = find_process_capabilities(ruleset, extractor, p) - feature_counts.processes += ( + process_feature_counts.append( rdoc.ProcessFeatureCount( address=frz.Address.from_capa(p.address), count=process_capabilities.feature_count - ), + ) ) for rule_name, res in process_capabilities.process_matches.items(): @@ -317,7 +319,11 @@ def find_dynamic_capabilities( capa.engine.index_rule_matches(process_and_lower_features, rule, locations) all_file_capabilities = find_file_capabilities(ruleset, extractor, process_and_lower_features) - feature_counts.file = all_file_capabilities.feature_count + + feature_counts = rdoc.DynamicFeatureCounts( + file=all_file_capabilities.feature_count, + processes=tuple(process_feature_counts), + ) matches = dict( itertools.chain( diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index d485aa48..893887f7 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -156,8 +156,11 @@ def find_static_capabilities( all_bb_matches: MatchResults = collections.defaultdict(list) all_insn_matches: MatchResults = collections.defaultdict(list) - feature_counts = rdoc.StaticFeatureCounts(file=0, functions=()) - library_functions: tuple[rdoc.LibraryFunction, ...] = () + # Accumulate into lists to avoid O(n²) tuple concatenation. + # Tuples are immutable, so `t += (x,)` copies the entire tuple each time. + # For binaries with thousands of functions this becomes quadratic in memory work. 
+ function_feature_counts: list[rdoc.FunctionFeatureCount] = [] + library_functions_list: list[rdoc.LibraryFunction] = [] assert isinstance(extractor, StaticFeatureExtractor) functions: list[FunctionHandle] = list(extractor.get_functions()) @@ -176,20 +179,20 @@ def find_static_capabilities( if extractor.is_library_function(f.address): function_name = extractor.get_function_name(f.address) logger.debug("skipping library function 0x%x (%s)", f.address, function_name) - library_functions += ( - rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name), + library_functions_list.append( + rdoc.LibraryFunction(address=frz.Address.from_capa(f.address), name=function_name) ) - n_libs = len(library_functions) + n_libs = len(library_functions_list) percentage = round(100 * (n_libs / n_funcs)) pbar.update(task, postfix=f"skipped {n_libs} library functions, {percentage}%") pbar.advance(task) continue code_capabilities = find_code_capabilities(ruleset, extractor, f) - feature_counts.functions += ( + function_feature_counts.append( rdoc.FunctionFeatureCount( address=frz.Address.from_capa(f.address), count=code_capabilities.feature_count - ), + ) ) t1 = time.time() @@ -230,7 +233,11 @@ def find_static_capabilities( capa.engine.index_rule_matches(function_and_lower_features, rule, locations) all_file_capabilities = find_file_capabilities(ruleset, extractor, function_and_lower_features) - feature_counts.file = all_file_capabilities.feature_count + + feature_counts = rdoc.StaticFeatureCounts( + file=all_file_capabilities.feature_count, + functions=tuple(function_feature_counts), + ) matches: MatchResults = dict( itertools.chain( @@ -244,4 +251,4 @@ def find_static_capabilities( ) ) - return Capabilities(matches, feature_counts, library_functions) + return Capabilities(matches, feature_counts, tuple(library_functions_list)) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index da0a7d03..1ca203e2 100644 --- a/capa/rules/__init__.py +++ 
b/capa/rules/__init__.py @@ -1447,6 +1447,13 @@ class RuleSet: scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes } + # Pre-compute the topological index mapping for each scope. + # This avoids rebuilding the dict on every call to _match (which runs once per + # instruction/basic-block/function/file scope, i.e. potentially millions of times). + self._rule_index_by_scope: dict[Scope, dict[str, int]] = { + scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes + } + @property def file_rules(self): return self.rules_by_scope[Scope.FILE] @@ -1876,11 +1883,13 @@ class RuleSet: """ done = [] - # use a queue of rules, because we'll be modifying the list (appending new items) as we go. - while rules: - rule = rules.pop(0) + # use a list as a stack: append new items and pop() from the end, both O(1). + # order doesn't matter here since every rule on the stack is processed eventually. + rules_stack = list(rules) + while rules_stack: + rule = rules_stack.pop() for subscope_rule in rule.extract_subscope_rules(): - rules.append(subscope_rule) + rules_stack.append(subscope_rule) done.append(rule) return done @@ -1929,11 +1938,11 @@ class RuleSet: """ feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope] - rules: list[Rule] = self.rules_by_scope[scope] # Topologic location of rule given its name. # That is, rules with a lower index should be evaluated first, since their dependencies # will be evaluated later. - rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)} + # Pre-computed in __init__ to avoid rebuilding on every _match call. + rule_index_by_rule_name = self._rule_index_by_scope[scope] # This algorithm is optimized to evaluate as few rules as possible, # because the less work we do, the faster capa can run. 
@@ -2029,7 +2038,9 @@ class RuleSet: candidate_rules = [self.rules[name] for name in candidate_rule_names] # Order rules topologically, so that rules with dependencies work correctly. + # Sort descending so pop() from the end yields the topologically-first rule in O(1). RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) + candidate_rules.reverse() # # The following is derived from ceng.match @@ -2044,7 +2055,7 @@ class RuleSet: augmented_features = features while candidate_rules: - rule = candidate_rules.pop(0) + rule = candidate_rules.pop() res = rule.evaluate(augmented_features, short_circuit=True) if res: # we first matched the rule with short circuiting enabled. @@ -2083,6 +2094,7 @@ class RuleSet: candidate_rule_names.update(new_candidates) candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates]) RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) + candidate_rules.reverse() return (augmented_features, results)