class Substring(String):
    """
    string feature that matches when its value occurs anywhere inside a `String` feature's value.

    matching is a plain containment check (`self.value in feature.value`).
    """

    def __init__(self, value: str, description=None):
        super(Substring, self).__init__(value, description=description)
        self.value = value

    def evaluate(self, ctx):
        """
        search the `String` features in `ctx` for ones whose value contains `self.value`.

        args:
          ctx: mapping from feature to an iterable of locations.

        returns:
          capa.engine.Result: successful, with the union of matching locations, when at least
            one string contains the substring; otherwise unsuccessful.
            in both cases the statement is a `_MatchedSubstring` wrapping `self`.

        raises:
          ValueError: if a `String` feature carries a non-str value.
        """
        # complete matched string -> locations at which it was seen (may contain duplicates here).
        hits = collections.defaultdict(list)

        for feature, where in ctx.items():
            if not isinstance(feature, String):
                continue

            haystack = feature.value
            if not isinstance(haystack, str):
                # this is a programming error: String should only contain str
                raise ValueError("unexpected feature value type")

            if self.value in haystack:
                hits[haystack].extend(where)

        if not hits:
            return capa.engine.Result(False, _MatchedSubstring(self, None), [])

        # plain dict (not defaultdict) with de-duplicated locations,
        # which makes json serialization easier.
        deduped = {s: list(set(locs)) for s, locs in hits.items()}

        all_locations = set()
        for locs in deduped.values():
            all_locations.update(locs)

        # unlike other features, we cannot put a reference to `self` directly in a `Result`.
        # this is because `self` may match on many strings, so we can't stuff the matched value into it.
        # instead, return a new instance that has a reference to both the substring and the matched values.
        return capa.engine.Result(True, _MatchedSubstring(self, deduped), [], locations=all_locations)

    def __str__(self):
        return "substring(%s)" % self.value


class _MatchedSubstring(Substring):
    """
    this represents specific match instances of a substring feature.
    treat it the same as a `Substring` except it has the `matches` field that contains the complete strings that matched.

    note: this type should only ever be constructed by `Substring.evaluate()`. it is not part of the public API.
    """

    def __init__(self, substring: Substring, matches):
        """
        args:
          substring (Substring): the substring feature that matches.
          matches (Dict[string, List[int]]|None): mapping from matching string to its locations.
        """
        super(_MatchedSubstring, self).__init__(str(substring.value), description=substring.description)
        # we want this to collide with the name of `Substring` above,
        # so that it works nicely with the renderers.
        self.name = "substring"
        # this may be None if the substring doesn't match
        self.matches = matches

    def __str__(self):
        quoted = ['"' + s + '"' for s in (self.matches or {})]
        return 'substring("%s", matches = %s)' % (self.value, ", ".join(quoted))
def check_rule(self, ctx, rule):
    """
    flag `string:` and `substring:` values that are not wrapped in double quotes.

    walks the YAML parse events of `rule.definition`. when a scalar key named
    `string` or `substring` is followed by a scalar value that is unquoted or
    single-quoted, a fix-it message is stored in `self.recommendation`.
    `string` values that look like a regex (`/.../` or `/.../i`) are skipped;
    `substring` values are always checked.

    returns:
      bool: True if a violation was found, otherwise False.
    """
    quoted_keys = ("string", "substring")
    events = capa.rules.Rule._get_ruamel_yaml_parser().parse(rule.definition)

    for key in events:
        is_target = isinstance(key, ruamel.yaml.ScalarEvent) and key.value in quoted_keys
        if not is_target:
            continue

        value = next(events)  # assume value is next event
        if not isinstance(value, ruamel.yaml.ScalarEvent):
            # ignore non-scalar
            continue

        text = value.value
        if key.value == "string" and text.startswith("/") and text.endswith(("/", "/i")):
            # ignore regex for now
            continue

        if value.style is None:
            # no quotes
            self.recommendation = 'add double quotes to "%s"' % text
            return True

        if value.style == "'":
            # single quote
            self.recommendation = 'change single quotes to double quotes for "%s"' % text
            return True

    return False
def test_substring():
    """a `substring: abc` rule matches strings that contain "abc" at the start, middle, or end."""
    rules = [
        capa.rules.Rule.from_yaml(
            textwrap.dedent(
                """
                rule:
                    meta:
                        name: test rule
                    features:
                        - and:
                            - substring: abc
                """
            )
        ),
    ]

    # (extracted string, whether the rule is expected to match)
    cases = [
        ("aaaa", False),
        ("abc", True),
        ("111abc222", True),
        ("111abc", True),
        ("abc222", True),
    ]

    for text, expected in cases:
        features, matches = capa.engine.match(
            capa.rules.topologically_order_rules(rules),
            {capa.features.common.String(text): {1}},
            0x0,
        )
        matched = capa.features.common.MatchedRule("test rule") in features
        assert matched is expected
def test_substring_feature():
    """`substring:` values parse to `Substring` features whether bare, quoted, or containing an escape."""
    rule = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - or:
                    - substring: abc
                    - substring: "def"
                    - substring: "gh\\ni"
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule)
    children = list(parsed.statement.get_children())

    for expected in ("abc", "def", "gh\ni"):
        assert Substring(expected) in children


def test_substring_description():
    """a `substring:` entry that carries a `description` still parses to the `Substring` feature."""
    rule = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - or:
                    - substring: abc
                      description: the start of the alphabet
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule)
    children = list(parsed.statement.get_children())

    assert Substring("abc") in children