From 4b7a9e149fa90f4b76cdafae4642dfad95ba6ed2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 13:27:30 +0100 Subject: [PATCH 01/31] rules: move to directory structure --- capa/{rules.py => rules/__init__.py} | 0 tests/test_rules.py | 1 - 2 files changed, 1 deletion(-) rename capa/{rules.py => rules/__init__.py} (100%) diff --git a/capa/rules.py b/capa/rules/__init__.py similarity index 100% rename from capa/rules.py rename to capa/rules/__init__.py diff --git a/tests/test_rules.py b/tests/test_rules.py index 466ac306..fe154c39 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -31,7 +31,6 @@ from capa.features.common import ( Substring, FeatureAccess, ) -from capa.features.address import AbsoluteVirtualAddress ADDR1 = capa.features.address.AbsoluteVirtualAddress(0x401001) ADDR2 = capa.features.address.AbsoluteVirtualAddress(0x401002) From 476ffabae9471b657d9f923063e23762b0eb22b0 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 14:50:00 +0100 Subject: [PATCH 02/31] rules: cache the ruleset to disk ref: #1212 --- capa/main.py | 43 ++++++-- capa/rules/cache.py | 121 +++++++++++++++++++++++ scripts/bulk-process.py | 2 +- scripts/capa2yara.py | 5 +- scripts/capa_as_library.py | 2 +- scripts/lint.py | 2 +- scripts/profile-time.py | 2 +- scripts/show-capabilities-by-function.py | 2 +- 8 files changed, 162 insertions(+), 17 deletions(-) create mode 100644 capa/rules/cache.py diff --git a/capa/main.py b/capa/main.py index b3a654a4..4543ce9e 100644 --- a/capa/main.py +++ b/capa/main.py @@ -33,6 +33,7 @@ import capa.rules import capa.engine import capa.version import capa.render.json +import capa.rules.cache import capa.render.default import capa.render.verbose import capa.features.common @@ -561,7 +562,10 @@ def is_nursery_rule_path(path: str) -> bool: return "nursery" in path -def get_rules(rule_paths: List[str], disable_progress=False) -> List[Rule]: +def collect_rule_file_paths(rule_paths: List[str]) -> List[str]: + """ + collect all rule file paths, including those in subdirectories. + """ rule_file_paths = [] for rule_path in rule_paths: if not os.path.exists(rule_path): @@ -589,6 +593,23 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> List[Rule]: rule_path = os.path.join(root, file) rule_file_paths.append(rule_path) + return rule_file_paths + + +def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: + rule_file_paths = collect_rule_file_paths(rule_paths) + + # this list is parallel to `rule_file_paths`: + # rule_file_paths[i] corresponds to rule_contents[i]. + rule_contents = [] + for file_path in rule_file_paths: + with open(file_path, "rb") as f: + rule_contents.append(f.read()) + + ruleset = capa.rules.cache.load_cached_ruleset(rule_contents) + if ruleset is not None: + return ruleset + rules = [] # type: List[Rule] pbar = tqdm.tqdm @@ -597,20 +618,24 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> List[Rule]: # to disable progress completely pbar = lambda s, *args, **kwargs: s - for rule_file_path in pbar(list(rule_file_paths), desc="loading ", unit=" rules"): + for path, content in pbar(zip(rule_file_paths, rule_contents), desc="parsing ", unit=" rules"): try: - rule = capa.rules.Rule.from_yaml_file(rule_file_path) + rule = capa.rules.Rule.from_yaml(content) except capa.rules.InvalidRule: raise else: - rule.meta["capa/path"] = rule_file_path - if is_nursery_rule_path(rule_file_path): + rule.meta["capa/path"] = path + if is_nursery_rule_path(path): rule.meta["capa/nursery"] = True rules.append(rule) - logger.debug("loaded rule: '%s' with scope: %s", rule.name, rule.scope) + logger.debug("parsed rule: '%s' with scope: %s", rule.name, rule.scope) - return rules + ruleset = capa.rules.RuleSet(rules) + + capa.rules.cache.cache_ruleset(ruleset) + + return ruleset def get_signatures(sigs_path): @@ -1001,7 +1026,7 @@ def main(argv=None): return E_INVALID_FILE_TYPE try: - rules = capa.rules.RuleSet(get_rules(args.rules, disable_progress=args.quiet)) + rules = get_rules(args.rules, disable_progress=args.quiet) logger.debug( "successfully loaded %s rules", @@ -1151,7 +1176,7 @@ def ida_main(): rules_path = os.path.join(get_default_root(), "rules") logger.debug("rule path: %s", rules_path) - rules = capa.rules.RuleSet(get_rules([rules_path])) + rules = get_rules([rules_path]) meta = capa.ida.helpers.collect_metadata([rules_path]) diff --git a/capa/rules/cache.py b/capa/rules/cache.py new file mode 100644 index 00000000..ed20ec66 --- /dev/null +++ b/capa/rules/cache.py @@ -0,0 +1,121 @@ +import sys +import pickle +import hashlib +import logging +import os.path +from typing import List, Optional +from dataclasses import dataclass + +import capa.rules + +logger = logging.getLogger(__name__) + + +# TypeAlias. note: using `foo: TypeAlias = bar` is Python 3.10+ +CacheIdentifier = str + + +def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: + hash = hashlib.sha256() + + # note that this changes with each release, + # so cache identifiers will never collide across releases. + version = capa.version.__version__ + + hash.update(version.encode("utf-8")) + hash.update(b"\x00") + + rule_hashes = list(sorted([hashlib.sha256(rule).hexdigest() for rule in rule_content])) + for rule_hash in rule_hashes: + hash.update(rule_hash.encode("ascii")) + hash.update(b"\x00") + + return hash.hexdigest() + + +def get_default_cache_directory() -> str: + # ref: https://github.com/mandiant/capa/issues/1212#issuecomment-1361259813 + # + # Linux: $XDG_CACHE_HOME/capa/ + # Windows: %LOCALAPPDATA%\flare\capa\cache + # MacOS: ~/Library/Caches/capa + + # ref: https://stackoverflow.com/a/8220141/87207 + if sys.platform == "linux" or sys.platform == "linux2": + directory = os.environ.get("XDG_CACHE_HOME", os.path.join(os.environ["HOME"], ".cache", "capa")) + elif sys.platform == "darwin": + directory = os.path.join(os.environ["HOME"], "Library", "Caches", "capa") + elif sys.platform == "win32": + directory = os.path.join(os.environ["LOCALAPPDATA"], "flare", "capa", "cache") + else: + raise NotImplementedError(f"unsupported platform: {sys.platform}") + + os.makedirs(directory, exist_ok=True) + + return directory + + +def get_default_cache_path(id: CacheIdentifier) -> str: + filename = "capa-" + id[:8] + ".cache" + return os.path.join(get_default_cache_directory(), filename) + + +MAGIC = b"capa" +VERSION = b"\x00\x00\x00\x01" + + +@dataclass +class RuleCache: + id: CacheIdentifier + ruleset: capa.rules.RuleSet + + def dump(self): + return MAGIC + VERSION + self.id.encode("ascii") + pickle.dumps(self) + + @staticmethod + def load(data): + assert data.startswith(MAGIC + VERSION) + + id = data[0x8:0x48].decode("ascii") + cache = pickle.loads(data[0x48:]) + + assert isinstance(cache, RuleCache) + assert cache.id == id + + return cache + + +def cache_ruleset(ruleset: capa.rules.RuleSet): + rule_contents = [] + for rule in ruleset.rules.values(): + if rule.is_subscope_rule(): + continue + with open(rule.meta["capa/path"], "rb") as f: + rule_contents.append(f.read()) + + id = compute_cache_identifier(rule_contents) + path = get_default_cache_path(id) + if os.path.exists(path): + logger.debug("rule set already cached to %s", path) + return + + cache = RuleCache(id, ruleset) + with open(path, "wb") as f: + f.write(cache.dump()) + + logger.debug("rule set cached to %s", path) + return + + +def load_cached_ruleset(rule_contents: List[bytes]) -> Optional[capa.rules.RuleSet]: + id = compute_cache_identifier(rule_contents) + path = get_default_cache_path(id) + if not os.path.exists(path): + logger.debug("rule set cache does not exist: %s", path) + return None + + logger.debug("loading rule set from cache: %s", path) + with open(path, "rb") as f: + buf = f.read() + cache = RuleCache.load(buf) + return cache.ruleset diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index b57928c6..c235116d 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -152,7 +152,7 @@ def main(argv=None): capa.main.handle_common_args(args) try: - rules = capa.rules.RuleSet(capa.main.get_rules(args.rules)) + rules = capa.main.get_rules(args.rules) logger.info("successfully loaded %s rules", len(rules)) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 7fd4ad41..97bc1f8a 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -709,9 +709,8 @@ def main(argv=None): logging.getLogger("capa2yara").setLevel(level) try: - rules_ = capa.main.get_rules([args.rules], disable_progress=True) - namespaces = capa.rules.index_rules_by_namespace(rules_) - rules = capa.rules.RuleSet(rules_) + rules = capa.main.get_rules([args.rules], disable_progress=True) + namespaces = capa.rules.index_rules_by_namespace(list(rules.rules.values())) logger.info("successfully loaded %s rules (including subscope rules which will be ignored)", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index 07b408cc..04d4a307 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -161,7 +161,7 @@ def render_dictionary(doc: rd.ResultDocument) -> Dict[str, Any]: # ==== render dictionary helpers def capa_details(rules_path, file_path, output_format="dictionary"): # load rules from disk - rules = capa.rules.RuleSet(capa.main.get_rules([rules_path], disable_progress=True)) + rules = capa.main.get_rules([rules_path], disable_progress=True) # extract features and find capabilities extractor = capa.main.get_extractor(file_path, "auto", capa.main.BACKEND_VIV, [], False, disable_progress=True) diff --git a/scripts/lint.py b/scripts/lint.py index ed6e8b17..cf1fa871 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -1002,7 +1002,7 @@ def main(argv=None): time0 = time.time() try: - rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True)) + rules = capa.main.get_rules(args.rules, disable_progress=True) logger.info("successfully loaded %s rules", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) diff --git a/scripts/profile-time.py b/scripts/profile-time.py index 3d6b5e07..0b76902d 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -88,7 +88,7 @@ def main(argv=None): try: with capa.main.timing("load rules"): - rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True)) + rules = capa.main.get_rules(args.rules, disable_progress=True) except (IOError) as e: logger.error("%s", str(e)) return -1 diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 7f0b13b6..8cc59d81 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -141,7 +141,7 @@ def main(argv=None): return -1 try: - rules = capa.rules.RuleSet(capa.main.get_rules(args.rules)) + rules = capa.main.get_rules(args.rules) logger.info("successfully loaded %s rules", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) From e644775ad1c69ccf6a45ff31f6c6d11b931cc1e4 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 14:52:47 +0100 Subject: [PATCH 03/31] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96cbdcc3..d2bcf43e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - dotnet: emit namespace/class features for type references #1242 @mike-hunhoff - dotnet: extract dotnet and pe format #1187 @mr-tz - don't render all library rule matches in vverbose output #1174 @mr-tz +- cache the rule set across invocations for better performance #1212 @williballenthin ### Breaking Changes - remove SMDA backend #1062 @williballenthin From e09d35bbb91b2e07136cb4cbbc10293248f5650e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:01:05 +0100 Subject: [PATCH 04/31] main: fix rule content decoding --- capa/main.py | 2 +- capa/rules/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/capa/main.py b/capa/main.py index d88722a2..d6817f8b 100644 --- a/capa/main.py +++ b/capa/main.py @@ -620,7 +620,7 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: for path, content in pbar(zip(rule_file_paths, rule_contents), desc="parsing ", unit=" rules"): try: - rule = capa.rules.Rule.from_yaml(content) + rule = capa.rules.Rule.from_yaml(content.decode("utf-8")) except capa.rules.InvalidRule: raise else: diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index ac344334..028f023a 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -743,7 +743,7 @@ class Rule: return self.statement.evaluate(features, short_circuit=short_circuit) @classmethod - def from_dict(cls, d, definition) -> "Rule": + def from_dict(cls, d: Dict[str, Any], definition: str) -> "Rule": meta = d["rule"]["meta"] name = meta["name"] # if scope is not specified, default to function scope. @@ -813,7 +813,7 @@ class Rule: return y @classmethod - def from_yaml(cls, s, use_ruamel=False) -> "Rule": + def from_yaml(cls, s: str, use_ruamel=False) -> "Rule": if use_ruamel: # ruamel enables nice formatting and doc roundtripping with comments doc = cls._get_ruamel_yaml_parser().load(s) From fbd7c566f40d3a2a9272ee93fac30ee36f8ef319 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:19:48 +0100 Subject: [PATCH 05/31] cache: add more helpers to enable better testing --- capa/rules/cache.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index ed20ec66..7d4a49a9 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -85,15 +85,22 @@ class RuleCache: return cache -def cache_ruleset(ruleset: capa.rules.RuleSet): +def get_ruleset_content(ruleset: capa.rules.RuleSet) -> List[bytes]: rule_contents = [] for rule in ruleset.rules.values(): if rule.is_subscope_rule(): continue - with open(rule.meta["capa/path"], "rb") as f: - rule_contents.append(f.read()) + rule_contents.append(rule.definition.encode("utf-8")) + return rule_contents - id = compute_cache_identifier(rule_contents) + +def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdentifier: + rule_contents = get_ruleset_content(ruleset) + return compute_cache_identifier(rule_contents) + + +def cache_ruleset(ruleset: capa.rules.RuleSet): + id = compute_ruleset_cache_identifier(ruleset) path = get_default_cache_path(id) if os.path.exists(path): logger.debug("rule set already cached to %s", path) From 03f72f498e4572292614d1d1f2f98f984d79d9a5 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:20:10 +0100 Subject: [PATCH 06/31] cache: use zlib to reduce cache size --- capa/rules/cache.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 7d4a49a9..375f1c97 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -1,4 +1,5 @@ import sys +import zlib import pickle import hashlib import logging @@ -7,6 +8,7 @@ from typing import List, Optional from dataclasses import dataclass import capa.rules +import capa.version logger = logging.getLogger(__name__) @@ -70,14 +72,14 @@ class RuleCache: ruleset: capa.rules.RuleSet def dump(self): - return MAGIC + VERSION + self.id.encode("ascii") + pickle.dumps(self) + return MAGIC + VERSION + self.id.encode("ascii") + zlib.compress(pickle.dumps(self)) @staticmethod def load(data): assert data.startswith(MAGIC + VERSION) id = data[0x8:0x48].decode("ascii") - cache = pickle.loads(data[0x48:]) + cache = pickle.loads(zlib.decompress(data[0x48:])) assert isinstance(cache, RuleCache) assert cache.id == id From c423ccec67090742cac2394d7dc27ed42e748237 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:20:26 +0100 Subject: [PATCH 07/31] add tests for ruleset caching --- tests/test_rule_cache.py | 82 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 tests/test_rule_cache.py diff --git a/tests/test_rule_cache.py b/tests/test_rule_cache.py new file mode 100644 index 00000000..4c786dda --- /dev/null +++ b/tests/test_rule_cache.py @@ -0,0 +1,82 @@ +# Copyright (C) 2023 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import os +import textwrap + +import capa.rules +import capa.rules.cache + + +R1 = capa.rules.Rule.from_yaml(textwrap.dedent( + """ + rule: + meta: + name: test rule + authors: + - user@domain.com + scope: function + examples: + - foo1234 + - bar5678 + features: + - and: + - number: 1 + - number: 2 + """ +)) + +R2 = capa.rules.Rule.from_yaml(textwrap.dedent( + """ + rule: + meta: + name: test rule 2 + authors: + - user@domain.com + scope: function + examples: + - foo1234 + - bar5678 + features: + - and: + - number: 3 + - number: 4 + """ +)) + + +def test_ruleset_cache_ids(): + rs = capa.rules.RuleSet([R1]) + content = capa.rules.cache.get_ruleset_content(rs) + + rs2 = capa.rules.RuleSet([R1, R2]) + content2 = capa.rules.cache.get_ruleset_content(rs2) + + id = capa.rules.cache.compute_cache_identifier(content) + id2 = capa.rules.cache.compute_cache_identifier(content2) + assert id != id2 + + +def test_ruleset_cache_save_load(): + rs = capa.rules.RuleSet([R1]) + content = capa.rules.cache.get_ruleset_content(rs) + + id = capa.rules.cache.compute_cache_identifier(content) + assert id is not None + + path = capa.rules.cache.get_default_cache_path(id) + try: + os.remove(path) + except OSError: + pass + + capa.rules.cache.cache_ruleset(rs) + assert os.path.exists(path) + + rs = capa.rules.cache.load_cached_ruleset(content) + assert rs is not None From 0888e5ad69c4eee8b1a88b5e24e7ebcc6cba253d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:22:43 +0100 Subject: [PATCH 08/31] main: more doc --- capa/main.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/capa/main.py b/capa/main.py index d6817f8b..e682407f 100644 --- a/capa/main.py +++ b/capa/main.py @@ -597,6 +597,13 @@ def collect_rule_file_paths(rule_paths: List[str]) -> List[str]: def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: + """ + args: + rule_paths: list of paths to rules files or directories containing rules files + """ + + # rule_paths may contain directory paths, + # so search for file paths recursively. rule_file_paths = collect_rule_file_paths(rule_paths) # this list is parallel to `rule_file_paths`: From 99af09fce550aaef6991b77f8c83daa59185a0cd Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:24:34 +0100 Subject: [PATCH 09/31] main: revert wording change, which was just churn --- capa/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index e682407f..248d51df 100644 --- a/capa/main.py +++ b/capa/main.py @@ -625,7 +625,7 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: # to disable progress completely pbar = lambda s, *args, **kwargs: s - for path, content in pbar(zip(rule_file_paths, rule_contents), desc="parsing ", unit=" rules"): + for path, content in pbar(zip(rule_file_paths, rule_contents), desc="loading ", unit=" rules"): try: rule = capa.rules.Rule.from_yaml(content.decode("utf-8")) except capa.rules.InvalidRule: @@ -636,7 +636,7 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: rule.meta["capa/nursery"] = True rules.append(rule) - logger.debug("parsed rule: '%s' with scope: %s", rule.name, rule.scope) + logger.debug("loaded rule: '%s' with scope: %s", rule.name, rule.scope) ruleset = capa.rules.RuleSet(rules) From 946816e377e3621b7b001b78b4756d2d0b5c63b3 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:26:17 +0100 Subject: [PATCH 10/31] cache: improve variable name --- capa/rules/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 375f1c97..b61ec8fd 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -27,7 +27,7 @@ def compute_cache_identifier(rule_content: List[bytes]) -> CacheIdentifier: hash.update(version.encode("utf-8")) hash.update(b"\x00") - rule_hashes = list(sorted([hashlib.sha256(rule).hexdigest() for rule in rule_content])) + rule_hashes = list(sorted([hashlib.sha256(buf).hexdigest() for buf in rule_content])) for rule_hash in rule_hashes: hash.update(rule_hash.encode("ascii")) hash.update(b"\x00") From f451fe68e15f00f45bcf9ed467d93268ecf5324f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:42:22 +0100 Subject: [PATCH 11/31] pep8/mypy --- tests/test_rule_cache.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/test_rule_cache.py b/tests/test_rule_cache.py index 4c786dda..f652077c 100644 --- a/tests/test_rule_cache.py +++ b/tests/test_rule_cache.py @@ -12,9 +12,9 @@ import textwrap import capa.rules import capa.rules.cache - -R1 = capa.rules.Rule.from_yaml(textwrap.dedent( - """ +R1 = capa.rules.Rule.from_yaml( + textwrap.dedent( + """ rule: meta: name: test rule @@ -29,10 +29,12 @@ R1 = capa.rules.Rule.from_yaml(textwrap.dedent( - number: 1 - number: 2 """ -)) + ) +) -R2 = capa.rules.Rule.from_yaml(textwrap.dedent( - """ +R2 = capa.rules.Rule.from_yaml( + textwrap.dedent( + """ rule: meta: name: test rule 2 @@ -47,7 +49,8 @@ R2 = capa.rules.Rule.from_yaml(textwrap.dedent( - number: 3 - number: 4 """ -)) + ) +) def test_ruleset_cache_ids(): @@ -78,5 +81,4 @@ def test_ruleset_cache_save_load(): capa.rules.cache.cache_ruleset(rs) assert os.path.exists(path) - rs = capa.rules.cache.load_cached_ruleset(content) - assert rs is not None + assert capa.rules.cache.load_cached_ruleset(content) is not None From 56a0bedac9f21955710c07d1069bdf125b16963f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 15:50:17 +0100 Subject: [PATCH 12/31] scripts: add tool to cache a ruleset to a directory --- scripts/cache-ruleset.py | 78 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 scripts/cache-ruleset.py diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py new file mode 100644 index 00000000..57635413 --- /dev/null +++ b/scripts/cache-ruleset.py @@ -0,0 +1,78 @@ +""" +Create a cache of the given rules. + +Usage: + + $ python scripts/cache-ruleset.py rules/ /path/to/cache/directory + +Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. +You may obtain a copy of the License at: [package root]/LICENSE.txt +Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. +""" +import os +import sys +import time +import logging +import argparse + +import capa.main +import capa.rules +import capa.engine +import capa.helpers +import capa.rules.cache +import capa.features.insn + +logger = logging.getLogger("cache-ruleset") + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser(description="Cache ruleset.") + capa.main.install_common_args(parser) + parser.add_argument("rules", type=str, action="append", help="Path to rules") + parser.add_argument("cache", type=str, help="Path to cache directory") + args = parser.parse_args(args=argv) + capa.main.handle_common_args(args) + + if args.debug: + logging.getLogger("capa").setLevel(logging.DEBUG) + logging.getLogger("viv_utils").setLevel(logging.DEBUG) + else: + logging.getLogger("capa").setLevel(logging.ERROR) + logging.getLogger("viv_utils").setLevel(logging.ERROR) + + time0 = time.time() + + try: + rules = capa.main.get_rules(args.rules, disable_progress=True) + logger.info("successfully loaded %s rules", len(rules)) + except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: + logger.error("%s", str(e)) + return -1 + + content = capa.rules.cache.get_ruleset_content(rules) + id = capa.rules.cache.compute_cache_identifier(content) + path = capa.rules.cache.get_default_cache_path(id) + + assert os.path.exists(path) + with open(path, "rb") as f: + buf = f.read() + + cache_filename = os.path.basename(path) + cache_filepath = os.path.join(args.cache, cache_filename) + if os.path.exists(cache_filepath): + logger.info("cache file already exists: %s", cache_filepath) + return 0 + + with open(os.path.join(args.cache, cache_filename), "wb") as f: + f.write(buf) + + +if __name__ == "__main__": + sys.exit(main()) From a7afdec2e161011670492cc6d7ebb8c014d0adaf Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 16:10:41 +0100 Subject: [PATCH 13/31] cache: accept cache_dir parameter --- capa/main.py | 23 ++++++++++++++++++----- capa/rules/cache.py | 13 +++++++------ scripts/cache-ruleset.py | 19 ++++--------------- tests/test_rule_cache.py | 8 +++++--- 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/capa/main.py b/capa/main.py index 248d51df..1fdabfdb 100644 --- a/capa/main.py +++ b/capa/main.py @@ -415,7 +415,7 @@ def get_default_root() -> str: under PyInstaller, this comes from _MEIPASS. under source, this is the root directory of the project. """ - if is_running_standalone(): + if capa.helpers.is_running_standalone(): # pylance/mypy don't like `sys._MEIPASS` because this isn't standard. # its injected by pyinstaller. # so we'll fetch this attribute dynamically. @@ -596,11 +596,13 @@ def collect_rule_file_paths(rule_paths: List[str]) -> List[str]: return rule_file_paths -def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: +def get_rules(rule_paths: List[str], disable_progress=False, cache_dir=None) -> RuleSet: """ args: rule_paths: list of paths to rules files or directories containing rules files """ + if cache_dir is None: + cache_dir = capa.rules.cache.get_default_cache_directory() # rule_paths may contain directory paths, # so search for file paths recursively. @@ -613,7 +615,7 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: with open(file_path, "rb") as f: rule_contents.append(f.read()) - ruleset = capa.rules.cache.load_cached_ruleset(rule_contents) + ruleset = capa.rules.cache.load_cached_ruleset(cache_dir, rule_contents) if ruleset is not None: return ruleset @@ -640,7 +642,7 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> RuleSet: ruleset = capa.rules.RuleSet(rules) - capa.rules.cache.cache_ruleset(ruleset) + capa.rules.cache.cache_ruleset(cache_dir, ruleset) return ruleset @@ -881,6 +883,9 @@ def handle_common_args(args): - rules: file system path to rule files. - signatures: file system path to signature files. + the following field may be added: + - is_default_rules: if the default rules were used. + args: args (argparse.Namespace): parsed arguments that included at least `install_common_args` args. """ @@ -940,6 +945,7 @@ def handle_common_args(args): return E_MISSING_RULES rules_paths.append(default_rule_path) + args.is_default_rules = True else: rules_paths = args.rules @@ -949,6 +955,8 @@ def handle_common_args(args): for rule_path in rules_paths: logger.debug("using rules path: %s", rule_path) + args.is_default_rules = False + args.rules = rules_paths if hasattr(args, "signatures"): @@ -1034,7 +1042,12 @@ def main(argv=None): return E_INVALID_FILE_TYPE try: - rules = get_rules(args.rules, disable_progress=args.quiet) + if is_running_standalone() and args.is_default_rules: + cache_dir = os.path.join(get_default_root(), "cache") + else: + cache_dir = capa.rules.cache.get_default_cache_directory() + + rules = get_rules(args.rules, disable_progress=args.quiet, cache_dir=cache_dir) logger.debug( "successfully loaded %s rules", diff --git a/capa/rules/cache.py b/capa/rules/cache.py index b61ec8fd..0cdc79ae 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -8,6 +8,7 @@ from typing import List, Optional from dataclasses import dataclass import capa.rules +import capa.helpers import capa.version logger = logging.getLogger(__name__) @@ -57,9 +58,9 @@ def get_default_cache_directory() -> str: return directory -def get_default_cache_path(id: CacheIdentifier) -> str: +def get_cache_path(cache_dir: str, id: CacheIdentifier) -> str: filename = "capa-" + id[:8] + ".cache" - return os.path.join(get_default_cache_directory(), filename) + return os.path.join(cache_dir, filename) MAGIC = b"capa" @@ -101,9 +102,9 @@ def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdenti return compute_cache_identifier(rule_contents) -def cache_ruleset(ruleset: capa.rules.RuleSet): +def cache_ruleset(cache_dir: str, ruleset: capa.rules.RuleSet): id = compute_ruleset_cache_identifier(ruleset) - path = get_default_cache_path(id) + path = get_cache_path(cache_dir, id) if os.path.exists(path): logger.debug("rule set already cached to %s", path) return @@ -116,9 +117,9 @@ def cache_ruleset(ruleset: capa.rules.RuleSet): return -def load_cached_ruleset(rule_contents: List[bytes]) -> Optional[capa.rules.RuleSet]: +def load_cached_ruleset(cache_dir: str, rule_contents: List[bytes]) -> Optional[capa.rules.RuleSet]: id = compute_cache_identifier(rule_contents) - path = get_default_cache_path(id) + path = get_cache_path(cache_dir, id) if not os.path.exists(path): logger.debug("rule set cache does not exist: %s", path) return None diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 57635413..44b9a983 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -47,10 +47,9 @@ def main(argv=None): logging.getLogger("capa").setLevel(logging.ERROR) logging.getLogger("viv_utils").setLevel(logging.ERROR) - time0 = time.time() - try: - rules = capa.main.get_rules(args.rules, disable_progress=True) + os.makedirs(args.cache, exist_ok=True) + rules = capa.main.get_rules(args.rules, disable_progress=True, cache_dir=args.cache) logger.info("successfully loaded %s rules", len(rules)) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) @@ -58,20 +57,10 @@ def main(argv=None): content = capa.rules.cache.get_ruleset_content(rules) id = capa.rules.cache.compute_cache_identifier(content) - path = capa.rules.cache.get_default_cache_path(id) + path = capa.rules.cache.get_cache_path(args.cache, id) assert os.path.exists(path) - with open(path, "rb") as f: - buf = f.read() - - cache_filename = os.path.basename(path) - cache_filepath = os.path.join(args.cache, cache_filename) - if os.path.exists(cache_filepath): - logger.info("cache file already exists: %s", cache_filepath) - return 0 - - with open(os.path.join(args.cache, cache_filename), "wb") as f: - f.write(buf) + logger.info("cached to: %s", path) if __name__ == "__main__": diff --git a/tests/test_rule_cache.py b/tests/test_rule_cache.py index f652077c..7b4265e4 100644 --- a/tests/test_rule_cache.py +++ b/tests/test_rule_cache.py @@ -72,13 +72,15 @@ def test_ruleset_cache_save_load(): id = capa.rules.cache.compute_cache_identifier(content) assert id is not None - path = capa.rules.cache.get_default_cache_path(id) + cache_dir = capa.rules.cache.get_default_cache_directory() + + path = capa.rules.cache.get_cache_path(cache_dir, id) try: os.remove(path) except OSError: pass - capa.rules.cache.cache_ruleset(rs) + capa.rules.cache.cache_ruleset(cache_dir, rs) assert os.path.exists(path) - assert capa.rules.cache.load_cached_ruleset(content) is not None + assert capa.rules.cache.load_cached_ruleset(cache_dir, content) is not None From b3b9ec11ddf14bb49fdc37d46c8f9338629e6017 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 16:11:00 +0100 Subject: [PATCH 14/31] pyinstaller: package up the cache directory, too --- .github/pyinstaller/pyinstaller.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/pyinstaller/pyinstaller.spec b/.github/pyinstaller/pyinstaller.spec index 8130684a..0e614055 100644 --- a/.github/pyinstaller/pyinstaller.spec +++ b/.github/pyinstaller/pyinstaller.spec @@ -19,6 +19,7 @@ a = Analysis( # i.e. ./.github/pyinstaller ("../../rules", "rules"), ("../../sigs", "sigs"), + ("../../cache", "cache"), # capa.render.default uses tabulate that depends on wcwidth. # it seems wcwidth uses a json file `version.json` # and this doesn't get picked up by pyinstaller automatically. From 33a46cc633c847585e5272e0b971024ed8cccb7e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 16:19:46 +0100 Subject: [PATCH 15/31] ci: cache the ruleset --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 52be6841..f45cb98a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -42,6 +42,8 @@ jobs: run: python -m pip install --upgrade pip setuptools - name: Install capa with build requirements run: pip install -e .[build] + - name: Cache the rule set + run: python ./scripts/cache-ruleset.py ./rules/ ./cache/ - name: Build standalone executable run: pyinstaller --log-level DEBUG .github/pyinstaller/pyinstaller.spec - name: Does it run (PE)? From 1a498d1afc6a8c5db998ba22e8735c8bd74b5850 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 20 Jan 2023 16:21:44 +0100 Subject: [PATCH 16/31] main: fix reference error --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 1fdabfdb..45f6790d 100644 --- a/capa/main.py +++ b/capa/main.py @@ -415,7 +415,7 @@ def get_default_root() -> str: under PyInstaller, this comes from _MEIPASS. under source, this is the root directory of the project. """ - if capa.helpers.is_running_standalone(): + if is_running_standalone(): # pylance/mypy don't like `sys._MEIPASS` because this isn't standard. # its injected by pyinstaller. # so we'll fetch this attribute dynamically. From e503cedd8ffff67e1687977ff60b783dd5e3919b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 17:31:57 +0100 Subject: [PATCH 17/31] main: pbar: realize the list so it has a length --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 45f6790d..dd0a76ef 100644 --- a/capa/main.py +++ b/capa/main.py @@ -627,7 +627,7 @@ def get_rules(rule_paths: List[str], disable_progress=False, cache_dir=None) -> # to disable progress completely pbar = lambda s, *args, **kwargs: s - for path, content in pbar(zip(rule_file_paths, rule_contents), desc="loading ", unit=" rules"): + for path, content in pbar(list(zip(rule_file_paths, rule_contents)), desc="loading ", unit=" rules"): try: rule = capa.rules.Rule.from_yaml(content.decode("utf-8")) except capa.rules.InvalidRule: From 6d16cafbc887ef6dcf14b79be85f4674aa567ba2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 18:14:12 +0100 Subject: [PATCH 18/31] cache: handle invalid caches --- capa/rules/cache.py | 8 ++++++++ tests/test_rule_cache.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 0cdc79ae..4c777e25 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -127,5 +127,13 @@ def load_cached_ruleset(cache_dir: str, rule_contents: List[bytes]) -> Optional[ logger.debug("loading rule set from cache: %s", path) with open(path, "rb") as f: buf = f.read() + + try: cache = RuleCache.load(buf) + except AssertionError: + logger.debug("rule set cache is invalid: %s", path) + # delete the cache that seems to be invalid. + os.remove(path) + return None + else: return cache.ruleset diff --git a/tests/test_rule_cache.py b/tests/test_rule_cache.py index 7b4265e4..fb11e5e7 100644 --- a/tests/test_rule_cache.py +++ b/tests/test_rule_cache.py @@ -84,3 +84,32 @@ def test_ruleset_cache_save_load(): assert os.path.exists(path) assert capa.rules.cache.load_cached_ruleset(cache_dir, content) is not None + + +def test_ruleset_cache_invalid(): + rs = capa.rules.RuleSet([R1]) + content = capa.rules.cache.get_ruleset_content(rs) + id = capa.rules.cache.compute_cache_identifier(content) + cache_dir = capa.rules.cache.get_default_cache_directory() + path = capa.rules.cache.get_cache_path(cache_dir, id) + try: + os.remove(path) + except OSError: + pass + + capa.rules.cache.cache_ruleset(cache_dir, rs) + assert os.path.exists(path) + + with open(path, "rb") as f: + buf = f.read() + + # corrupt the magic header + buf = b"x" + buf[1:] + + with open(path, "wb") as f: + f.write(buf) + + assert os.path.exists(path) + assert capa.rules.cache.load_cached_ruleset(cache_dir, content) is None + # the invalid cache should be deleted + assert not os.path.exists(path) From 6c83db99772dc276342d177980ecd25ebf710d05 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 23 Jan 2023 12:12:57 +0100 Subject: [PATCH 19/31] Update scripts/cache-ruleset.py Co-authored-by: Moritz --- scripts/cache-ruleset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 44b9a983..5f667beb 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -42,7 +42,6 @@ def main(argv=None): if args.debug: logging.getLogger("capa").setLevel(logging.DEBUG) - logging.getLogger("viv_utils").setLevel(logging.DEBUG) else: logging.getLogger("capa").setLevel(logging.ERROR) logging.getLogger("viv_utils").setLevel(logging.ERROR) From 68603a9cc790126dc586568c0fbf962cffe351a4 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 23 Jan 2023 12:13:07 +0100 Subject: [PATCH 20/31] Update scripts/cache-ruleset.py Co-authored-by: Moritz --- scripts/cache-ruleset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 5f667beb..dbacf8af 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -44,7 +44,6 @@ def main(argv=None): logging.getLogger("capa").setLevel(logging.DEBUG) else: logging.getLogger("capa").setLevel(logging.ERROR) - logging.getLogger("viv_utils").setLevel(logging.ERROR) try: os.makedirs(args.cache, exist_ok=True) From e30dd08dec2f3099ea66bf189a1a77651d4e7fa3 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 18:20:14 +0100 Subject: [PATCH 21/31] cache: add doc --- capa/rules/cache.py | 16 ++++++++++++++++ scripts/cache-ruleset.py | 2 ++ 2 files changed, 18 insertions(+) diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 4c777e25..e2bdfc70 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -103,6 +103,14 @@ def compute_ruleset_cache_identifier(ruleset: capa.rules.RuleSet) -> CacheIdenti def cache_ruleset(cache_dir: str, ruleset: capa.rules.RuleSet): + """ + cache the given ruleset to disk, using the given cache directory. + this can subsequently be reloaded via `load_cached_ruleset`, + assuming the capa version and rule content does not change. + + callers should use this function to avoid the performance overhead + of validating rules on each run. + """ id = compute_ruleset_cache_identifier(ruleset) path = get_cache_path(cache_dir, id) if os.path.exists(path): @@ -118,6 +126,14 @@ def cache_ruleset(cache_dir: str, ruleset: capa.rules.RuleSet): def load_cached_ruleset(cache_dir: str, rule_contents: List[bytes]) -> Optional[capa.rules.RuleSet]: + """ + load a cached ruleset from disk, using the given cache directory. + the raw rule contents are required here to prove that the rules haven't changed + and to avoid stale cache entries. + + callers should use this function to avoid the performance overhead + of validating rules on each run. + """ id = compute_cache_identifier(rule_contents) path = get_cache_path(cache_dir, id) if not os.path.exists(path): diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index dbacf8af..af849512 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -1,5 +1,7 @@ """ Create a cache of the given rules. +This is only really intended to be used by CI to pre-cache rulesets +that will be distributed within PyInstaller binaries. Usage: From 887f37b72c1f42fca9410dfce160536c3f2b1ef3 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 19:10:02 +0100 Subject: [PATCH 22/31] main: get_rules: accept callback to update status --- capa/main.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/capa/main.py b/capa/main.py index dd0a76ef..2fb928a0 100644 --- a/capa/main.py +++ b/capa/main.py @@ -20,7 +20,7 @@ import textwrap import itertools import contextlib import collections -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, Callable import halo import tqdm @@ -596,7 +596,20 @@ def collect_rule_file_paths(rule_paths: List[str]) -> List[str]: return rule_file_paths -def get_rules(rule_paths: List[str], disable_progress=False, cache_dir=None) -> RuleSet: +# TypeAlias. note: using `foo: TypeAlias = bar` is Python 3.10+ +RulePath = str + + +def on_load_rule_default(_path: RulePath, i: int, _total: int) -> None: + return + + +def get_rules( + rule_paths: List[RulePath], + disable_progress=False, + cache_dir=None, + on_load_rule: Callable[[RulePath, int, int], None] = on_load_rule_default +) -> RuleSet: """ args: rule_paths: list of paths to rules files or directories containing rules files @@ -627,7 +640,10 @@ def get_rules(rule_paths: List[str], disable_progress=False, cache_dir=None) -> # to disable progress completely pbar = lambda s, *args, **kwargs: s - for path, content in pbar(list(zip(rule_file_paths, rule_contents)), desc="loading ", unit=" rules"): + total_rule_count = len(rule_file_paths) + for i, path, content in pbar(list(enumerate(zip(rule_file_paths, rule_contents))), desc="loading ", unit=" rules"): + on_load_rule(path, i, total_rule_count) + try: rule = capa.rules.Rule.from_yaml(content.decode("utf-8")) except capa.rules.InvalidRule: From 3c0e36d5d416da1853235217185dc37883710b3b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 19:10:35 +0100 Subject: [PATCH 23/31] ruleset: record number of source rules loaded --- capa/rules/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 028f023a..7d98e25a 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1081,6 +1081,14 @@ class RuleSet: ensure_rules_are_unique(rules) + # in the next step we extract subscope rules, + # which may inflate the number of rules tracked in this ruleset. + # so record number of rules initially provided to this ruleset. + # + # this number is really only meaningful to the user, + # who may compare it against the number of files on their file system. + self.source_rule_count = len(rules) + rules = self._extract_subscope_rules(rules) ensure_rule_dependencies_are_met(rules) From f152729c791818d264a3fd267bc1e4df29bb886f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 19:10:50 +0100 Subject: [PATCH 24/31] explorer: use main.get_rules and simplify cache --- capa/ida/plugin/cache.py | 10 +--- capa/ida/plugin/form.py | 113 +++++++++++++-------------------------- 2 files changed, 39 insertions(+), 84 deletions(-) diff --git a/capa/ida/plugin/cache.py b/capa/ida/plugin/cache.py index 4444d302..eec67734 100644 --- a/capa/ida/plugin/cache.py +++ b/capa/ida/plugin/cache.py @@ -8,26 +8,18 @@ from __future__ import annotations -import copy import itertools import collections from typing import Set, Dict, List, Tuple, Union, Optional import capa.engine -from capa.rules import Rule, Scope, RuleSet +from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.features.address import NO_ADDRESS, Address from capa.ida.plugin.extractor import CapaExplorerFeatureExtractor from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle -class CapaExplorerRuleSetCache: - def __init__(self, rules: List[Rule]): - # capa.rules.Ruleset modifies rules, so we use deepcopy to preserve the original list of rules and our cached list of rules - self.rules: List[Rule] = copy.deepcopy(rules) - self.ruleset: RuleSet = RuleSet(copy.deepcopy(self.rules)) - - class CapaRuleGenFeatureCacheNode: def __init__( self, diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index 175eb550..450e5c7b 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -27,7 +27,7 @@ import capa.render.json import capa.features.common import capa.render.result_document import capa.features.extractors.ida.extractor -from capa.rules import Rule +from capa.rules import Rule, RuleSet from capa.engine import FeatureSet from capa.ida.plugin.icon import QICON from capa.ida.plugin.view import ( @@ -36,7 +36,7 @@ from capa.ida.plugin.view import ( CapaExplorerRulegenPreview, CapaExplorerRulegenFeatures, ) -from capa.ida.plugin.cache import CapaRuleGenFeatureCache, CapaExplorerRuleSetCache +from capa.ida.plugin.cache import CapaRuleGenFeatureCache from capa.ida.plugin.error import UserCancelledError from capa.ida.plugin.hooks import CapaExplorerIdaHooks from capa.ida.plugin.model import CapaExplorerDataModel @@ -160,7 +160,7 @@ class CapaExplorerForm(idaapi.PluginForm): # caches used to speed up capa explorer analysis - these must be init to None self.resdoc_cache: Optional[capa.render.result_document.ResultDocument] = None - self.ruleset_cache: Optional[CapaExplorerRuleSetCache] = None + self.ruleset_cache: Optional[capa.rules.RuleSet] = None self.rulegen_feature_cache: Optional[CapaRuleGenFeatureCache] = None self.rulegen_current_function: Optional[FunctionHandle] = None @@ -525,8 +525,7 @@ class CapaExplorerForm(idaapi.PluginForm): meta["prev_base"] = idaapi.get_imagebase() self.model_data.reset() - def load_capa_rules(self): - """load capa rules from directory specified by user, either using IDA UI or settings""" + def ensure_capa_settings_rule_path(self): try: # resolve rules directory - check self and settings first, then ask user if not os.path.exists(settings.user.get(CAPA_SETTINGS_RULE_PATH, "")): @@ -567,56 +566,26 @@ class CapaExplorerForm(idaapi.PluginForm): logger.info("User cancelled analysis.") return False + if not os.path.exists(path): + logger.error("rule path %s does not exist or cannot be accessed" % path) + return False + + return True + + def load_capa_rules(self): + """load capa rules from directory specified by user, either using IDA UI or settings""" + if not self.ensure_capa_settings_rule_path(): + return False + rule_path: str = settings.user.get(CAPA_SETTINGS_RULE_PATH, "") try: - # following code is derived from capa.main.get_rules, we dup it here so we can inject code that allows - # user to cancel analysis from IDA UI - if not os.path.exists(rule_path): - raise IOError("rule path %s does not exist or cannot be accessed" % rule_path) - rule_paths: List[str] = [] - if os.path.isfile(rule_path): - rule_paths.append(rule_path) - elif os.path.isdir(rule_path): - for root, dirs, files in os.walk(rule_path): - if ".git" in root: - # the .github directory contains CI config in capa-rules - # this includes some .yml files - # these are not rules - # additionally, .git has files that are not .yml and generate the warning - # skip those too - continue - for file in files: - if not file.endswith(".yml"): - if not (file.startswith(".git") or file.endswith((".git", ".md", ".txt"))): - # expect to see .git* files, readme.md, format.md, and maybe a .git directory - # other things maybe are rules, but are mis-named. - logger.warning("skipping non-.yml file: %s", file) - continue - rule_path = os.path.join(root, file) - rule_paths.append(rule_path) - - rules: List[Rule] = [] - total_paths: int = len(rule_paths) - for (i, rule_path) in enumerate(rule_paths): - update_wait_box( - "loading capa rules from %s (%d of %d)" - % (settings.user[CAPA_SETTINGS_RULE_PATH], i + 1, total_paths) - ) + def on_load_rule(rule_path, i, total): + update_wait_box("loading capa rules from %s (%d of %d)" % (rule_path, i, total)) if ida_kernwin.user_cancelled(): raise UserCancelledError("user cancelled") - try: - rule = capa.rules.Rule.from_yaml_file(rule_path) - except capa.rules.InvalidRule: - raise - else: - rule.meta["capa/path"] = rule_path - if capa.main.is_nursery_rule_path(rule_path): - rule.meta["capa/nursery"] = True - rules.append(rule) - # cache rules and rule set - self.ruleset_cache = CapaExplorerRuleSetCache(rules) + self.ruleset_cache = capa.main.get_rules([rule_path], disable_progress=True, on_load_rule=on_load_rule) except UserCancelledError: logger.info("User cancelled analysis.") return False @@ -686,6 +655,10 @@ class CapaExplorerForm(idaapi.PluginForm): # function should handle exceptions and return False if not self.load_capa_rules(): return False + assert self.ruleset_cache is not None + # matching operations may update rule instances, + # so we'll work with a local copy of the ruleset. + ruleset = copy.deepcopy(self.ruleset_cache) if ida_kernwin.user_cancelled(): logger.info("User cancelled analysis.") @@ -694,17 +667,10 @@ class CapaExplorerForm(idaapi.PluginForm): update_wait_box("extracting features") try: - # just generated above - assert self.ruleset_cache is not None - meta = capa.ida.helpers.collect_metadata([settings.user[CAPA_SETTINGS_RULE_PATH]]) - capabilities, counts = capa.main.find_capabilities( - self.ruleset_cache.ruleset, extractor, disable_progress=True - ) + capabilities, counts = capa.main.find_capabilities(ruleset, extractor, disable_progress=True) meta["analysis"].update(counts) - meta["analysis"]["layout"] = capa.main.compute_layout( - self.ruleset_cache.ruleset, extractor, capabilities - ) + meta["analysis"]["layout"] = capa.main.compute_layout(ruleset, extractor, capabilities) except UserCancelledError: logger.info("User cancelled analysis.") return False @@ -735,7 +701,7 @@ class CapaExplorerForm(idaapi.PluginForm): capa.ida.helpers.inform_user_ida_ui("capa encountered file type warnings during analysis") - if capa.main.has_file_limitation(self.ruleset_cache.ruleset, capabilities, is_standalone=False): + if capa.main.has_file_limitation(ruleset, capabilities, is_standalone=False): capa.ida.helpers.inform_user_ida_ui("capa encountered file limitation warnings during analysis") except Exception as e: logger.error("Failed to check for file limitations (error: %s)", e, exc_info=True) @@ -748,9 +714,7 @@ class CapaExplorerForm(idaapi.PluginForm): update_wait_box("rendering results") try: - self.resdoc_cache = capa.render.result_document.ResultDocument.from_capa( - meta, self.ruleset_cache.ruleset, capabilities - ) + self.resdoc_cache = capa.render.result_document.ResultDocument.from_capa(meta, ruleset, capabilities) except Exception as e: logger.error("Failed to collect results (error: %s)", e, exc_info=True) return False @@ -759,12 +723,10 @@ class CapaExplorerForm(idaapi.PluginForm): # either the results are cached and the doc already exists, # or the doc was just created above assert self.resdoc_cache is not None - # same with rules cache, either it's cached or it was just loaded - assert self.ruleset_cache is not None self.model_data.render_capa_doc(self.resdoc_cache, self.view_show_results_by_function.isChecked()) self.set_view_status_label( - "capa rules: %s (%d rules)" % (settings.user[CAPA_SETTINGS_RULE_PATH], len(self.ruleset_cache.rules)) + "capa rules: %s (%d rules)" % (settings.user[CAPA_SETTINGS_RULE_PATH], ruleset.source_rule_count) ) except Exception as e: logger.error("Failed to render results (error: %s)", e, exc_info=True) @@ -809,6 +771,11 @@ class CapaExplorerForm(idaapi.PluginForm): else: logger.info('Using cached capa rules, click "Reset" to load rules from disk.') + assert self.ruleset_cache is not None + # matching operations may update rule instances, + # so we'll work with a local copy of the ruleset. + ruleset = copy.deepcopy(self.ruleset_cache) + # clear feature cache if self.rulegen_feature_cache is not None: self.rulegen_feature_cache = None @@ -868,18 +835,16 @@ class CapaExplorerForm(idaapi.PluginForm): all_function_features: FeatureSet = collections.defaultdict(set) try: - assert self.ruleset_cache is not None - if self.rulegen_current_function is not None: _, func_matches, bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( - self.ruleset_cache.ruleset, self.rulegen_current_function + ruleset, self.rulegen_current_function ) all_function_features.update( self.rulegen_feature_cache.get_all_function_features(self.rulegen_current_function) ) for (name, result) in itertools.chain(func_matches.items(), bb_matches.items(), insn_matches.items()): - rule = self.ruleset_cache.ruleset[name] + rule = ruleset[name] if rule.is_subscope_rule(): continue for (addr, _) in result: @@ -896,13 +861,11 @@ class CapaExplorerForm(idaapi.PluginForm): all_file_features: FeatureSet = collections.defaultdict(set) try: - assert self.ruleset_cache is not None - - _, file_matches = self.rulegen_feature_cache.find_file_capabilities(self.ruleset_cache.ruleset) + _, file_matches = self.rulegen_feature_cache.find_file_capabilities(ruleset) all_file_features.update(self.rulegen_feature_cache.get_all_file_features()) for (name, result) in file_matches.items(): - rule = self.ruleset_cache.ruleset[name] + rule = ruleset[name] if rule.is_subscope_rule(): continue for (addr, _) in result: @@ -928,7 +891,7 @@ class CapaExplorerForm(idaapi.PluginForm): self.view_rulegen_features.load_features(all_file_features, all_function_features) self.set_view_status_label( - "capa rules: %s (%d rules)" % (settings.user[CAPA_SETTINGS_RULE_PATH], len(self.ruleset_cache.rules)) + "capa rules: %s (%d rules)" % (settings.user[CAPA_SETTINGS_RULE_PATH], ruleset.source_rule_count) ) except Exception as e: logger.error("Failed to render views (error: %s)", e, exc_info=True) @@ -1037,7 +1000,7 @@ class CapaExplorerForm(idaapi.PluginForm): return # we must create a deep copy of rules because any rule matching operations modify the original rule - rules = copy.deepcopy(self.ruleset_cache.rules) + rules = copy.deepcopy([r for r in self.ruleset_cache.rules.values() if not r.is_subscope_rule()]) rules.append(rule) try: From a076a0c44e88a3a5e832286a74813e3682e19f63 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 19:24:20 +0100 Subject: [PATCH 25/31] main: further document get_rules --- capa/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/capa/main.py b/capa/main.py index 2fb928a0..3cf69f64 100644 --- a/capa/main.py +++ b/capa/main.py @@ -613,6 +613,9 @@ def get_rules( """ args: rule_paths: list of paths to rules files or directories containing rules files + disable_progress: disable progress bar + cache_dir: directory to use for caching rules, or will use the default detected cache directory if None + on_load_rule: callback to invoke before a rule is loaded, use for progress or cancellation """ if cache_dir is None: cache_dir = capa.rules.cache.get_default_cache_directory() From 67b9d2e1c08bc72ca551a35f2789947683c01b10 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 19:28:15 +0100 Subject: [PATCH 26/31] black --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 3cf69f64..4dcbb989 100644 --- a/capa/main.py +++ b/capa/main.py @@ -608,7 +608,7 @@ def get_rules( rule_paths: List[RulePath], disable_progress=False, cache_dir=None, - on_load_rule: Callable[[RulePath, int, int], None] = on_load_rule_default + on_load_rule: Callable[[RulePath, int, int], None] = on_load_rule_default, ) -> RuleSet: """ args: From 3dfd16c0331cd9fd0b4a14b3aa13e11fbfed0946 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 19:30:15 +0100 Subject: [PATCH 27/31] main: fix ValueError --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 4dcbb989..f42bf0f7 100644 --- a/capa/main.py +++ b/capa/main.py @@ -644,7 +644,7 @@ def get_rules( pbar = lambda s, *args, **kwargs: s total_rule_count = len(rule_file_paths) - for i, path, content in pbar(list(enumerate(zip(rule_file_paths, rule_contents))), desc="loading ", unit=" rules"): + for i, (path, content) in pbar(list(enumerate(zip(rule_file_paths, rule_contents))), desc="loading ", unit=" rules"): on_load_rule(path, i, total_rule_count) try: From 67cfefd2dfc525d7e93b07cfe66e977763d6262c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 19:38:23 +0100 Subject: [PATCH 28/31] main: get_rules: remove progress bar --- capa/ida/plugin/form.py | 2 +- capa/main.py | 12 ++---------- scripts/cache-ruleset.py | 2 +- scripts/capa2yara.py | 2 +- scripts/capa_as_library.py | 2 +- scripts/lint.py | 2 +- scripts/profile-time.py | 2 +- 7 files changed, 8 insertions(+), 16 deletions(-) diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index 450e5c7b..fc93ff3b 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -585,7 +585,7 @@ class CapaExplorerForm(idaapi.PluginForm): if ida_kernwin.user_cancelled(): raise UserCancelledError("user cancelled") - self.ruleset_cache = capa.main.get_rules([rule_path], disable_progress=True, on_load_rule=on_load_rule) + self.ruleset_cache = capa.main.get_rules([rule_path], on_load_rule=on_load_rule) except UserCancelledError: logger.info("User cancelled analysis.") return False diff --git a/capa/main.py b/capa/main.py index f42bf0f7..81f7e47d 100644 --- a/capa/main.py +++ b/capa/main.py @@ -606,14 +606,12 @@ def on_load_rule_default(_path: RulePath, i: int, _total: int) -> None: def get_rules( rule_paths: List[RulePath], - disable_progress=False, cache_dir=None, on_load_rule: Callable[[RulePath, int, int], None] = on_load_rule_default, ) -> RuleSet: """ args: rule_paths: list of paths to rules files or directories containing rules files - disable_progress: disable progress bar cache_dir: directory to use for caching rules, or will use the default detected cache directory if None on_load_rule: callback to invoke before a rule is loaded, use for progress or cancellation """ @@ -637,14 +635,8 @@ def get_rules( rules = [] # type: List[Rule] - pbar = tqdm.tqdm - if disable_progress: - # do not use tqdm to avoid unnecessary side effects when caller intends - # to disable progress completely - pbar = lambda s, *args, **kwargs: s - total_rule_count = len(rule_file_paths) - for i, (path, content) in pbar(list(enumerate(zip(rule_file_paths, rule_contents))), desc="loading ", unit=" rules"): + for i, (path, content) in enumerate(zip(rule_file_paths, rule_contents)): on_load_rule(path, i, total_rule_count) try: @@ -1066,7 +1058,7 @@ def main(argv=None): else: cache_dir = capa.rules.cache.get_default_cache_directory() - rules = get_rules(args.rules, disable_progress=args.quiet, cache_dir=cache_dir) + rules = get_rules(args.rules, cache_dir=cache_dir) logger.debug( "successfully loaded %s rules", diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index af849512..a2a49bdb 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -49,7 +49,7 @@ def main(argv=None): try: os.makedirs(args.cache, exist_ok=True) - rules = capa.main.get_rules(args.rules, disable_progress=True, cache_dir=args.cache) + rules = capa.main.get_rules(args.rules, cache_dir=args.cache) logger.info("successfully loaded %s rules", len(rules)) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 97bc1f8a..89b4a41d 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -709,7 +709,7 @@ def main(argv=None): logging.getLogger("capa2yara").setLevel(level) try: - rules = capa.main.get_rules([args.rules], disable_progress=True) + rules = capa.main.get_rules(args.rules) namespaces = capa.rules.index_rules_by_namespace(list(rules.rules.values())) logger.info("successfully loaded %s rules (including subscope rules which will be ignored)", len(rules)) if args.tag: diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index 04d4a307..1122969b 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -161,7 +161,7 @@ def render_dictionary(doc: rd.ResultDocument) -> Dict[str, Any]: # ==== render dictionary helpers def capa_details(rules_path, file_path, output_format="dictionary"): # load rules from disk - rules = capa.main.get_rules([rules_path], disable_progress=True) + rules = capa.main.get_rules([rules_path]) # extract features and find capabilities extractor = capa.main.get_extractor(file_path, "auto", capa.main.BACKEND_VIV, [], False, disable_progress=True) diff --git a/scripts/lint.py b/scripts/lint.py index c6639807..77740307 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -998,7 +998,7 @@ def main(argv=None): time0 = time.time() try: - rules = capa.main.get_rules(args.rules, disable_progress=True) + rules = capa.main.get_rules(args.rules) logger.info("successfully loaded %s rules", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) diff --git a/scripts/profile-time.py b/scripts/profile-time.py index 0b76902d..73caabb9 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -88,7 +88,7 @@ def main(argv=None): try: with capa.main.timing("load rules"): - rules = capa.main.get_rules(args.rules, disable_progress=True) + rules = capa.main.get_rules(args.rules) except (IOError) as e: logger.error("%s", str(e)) return -1 From 61112c2527e4dbe693c14ac6c6c593627479f09b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Sat, 21 Jan 2023 20:16:49 +0100 Subject: [PATCH 29/31] lint: fix pbar counts --- scripts/lint.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/lint.py b/scripts/lint.py index 77740307..2c0fa739 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -917,12 +917,11 @@ def lint(ctx: Context): """ ret = {} - with tqdm.contrib.logging.tqdm_logging_redirect(ctx.rules.rules.items(), unit="rule") as pbar: + source_rules = [rule for rule in ctx.rules.rules.values() if not rule.is_subscope_rule()] + with tqdm.contrib.logging.tqdm_logging_redirect(source_rules, unit="rule") as pbar: with redirecting_print_to_tqdm(): - for name, rule in pbar: - if rule.is_subscope_rule(): - continue - + for rule in pbar: + name = rule.name pbar.set_description(width("linting rule: %s" % (name), 48)) ret[name] = lint_rule(ctx, rule) @@ -999,7 +998,7 @@ def main(argv=None): try: rules = capa.main.get_rules(args.rules) - logger.info("successfully loaded %s rules", len(rules)) + logger.info("successfully loaded %s rules", rules.source_rule_count) if args.tag: rules = rules.filter_rules_by_meta(args.tag) logger.debug("selected %s rules", len(rules)) From 83eefd343c4090123b4955da5130ac45fa312f6e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 24 Jan 2023 15:33:37 +0100 Subject: [PATCH 30/31] Update scripts/capa2yara.py Co-authored-by: Moritz --- scripts/capa2yara.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 89b4a41d..8e477a6b 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -709,7 +709,7 @@ def main(argv=None): logging.getLogger("capa2yara").setLevel(level) try: - rules = capa.main.get_rules(args.rules) + rules = capa.main.get_rules([args.rules]) namespaces = capa.rules.index_rules_by_namespace(list(rules.rules.values())) logger.info("successfully loaded %s rules (including subscope rules which will be ignored)", len(rules)) if args.tag: From e5549d6ce8e5881a8943ede9d53190f3acc3b9d2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 25 Jan 2023 16:47:01 +0100 Subject: [PATCH 31/31] Update capa/ida/plugin/form.py --- capa/ida/plugin/form.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index fc93ff3b..ee1faa3e 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -1000,6 +1000,8 @@ class CapaExplorerForm(idaapi.PluginForm): return # we must create a deep copy of rules because any rule matching operations modify the original rule + # the ruleset may derive subscope rules from the source rules loaded from disk. + # by ignoring them, we reconstruct the collection of rules provided by the user. rules = copy.deepcopy([r for r in self.ruleset_cache.rules.values() if not r.is_subscope_rule()]) rules.append(rule)