features: extract all strings matching regex

closes #159
2026-02-04 11:07:53 -08:00 · 2021-05-27 10:27:39 -06:00
parent ed02088c82
commit 868b5ed6a3
4 changed files with 69 additions and 31 deletions
--- a/capa/features/__init__.py
+++ b/capa/features/__init__.py
@@ -9,6 +9,7 @@
 import re
 import codecs
 import logging
+import collections

 import capa.engine
 import capa.features
@@ -155,6 +156,10 @@ class Regex(String):
            )

    def evaluate(self, ctx):
+        # mapping from string value to list of locations.
+        # the locations are de-duplicated later on.
+        matches = collections.defaultdict(list)
+
        for feature, locations in ctx.items():
            if not isinstance(feature, (capa.features.String,)):
                continue
@@ -164,13 +169,26 @@ class Regex(String):
            # using this mode cleans is more convenient for rule authors,
            # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
            if self.re.search(feature.value):
-                # unlike other features, we cannot return put a reference to `self` directly in a `Result`.
-                # this is because `self` may match on many strings, so we can't stuff the matched value into it.
-                # instead, return a new instance that has a reference to both the regex and the matched value.
-                # see #262.
-                return capa.engine.Result(True, _MatchedRegex(self, feature.value), [], locations=locations)
+                matches[feature.value].extend(locations)

-        return capa.engine.Result(False, _MatchedRegex(self, None), [])
+        if matches:
+            # finalize: defaultdict -> dict
+            # which makes json serialization easier
+            matches = dict(matches)
+
+            # collect all locations
+            locations = set()
+            for s in matches.keys():
+                matches[s] = list(set(matches[s]))
+                locations.update(matches[s])
+
+            # unlike other features, we cannot put a reference to `self` directly in a `Result`.
+            # this is because `self` may match on many strings, so we can't stuff the matched value into it.
+            # instead, return a new instance that has a reference to both the regex and the matched values.
+            # see #262.
+            return capa.engine.Result(True, _MatchedRegex(self, matches), [], locations=locations)
+        else:
+            return capa.engine.Result(False, _MatchedRegex(self, None), [])

    def __str__(self):
        return "regex(string =~ %s)" % self.value
@@ -178,27 +196,27 @@ class Regex(String):

 class _MatchedRegex(Regex):
    """
-    this represents a specific instance of a regular expression feature match.
-    treat it the same as a `Regex` except it has the `match` field that contains the complete string that matched.
+    this represents specific match instances of a regular expression feature.
+    treat it the same as a `Regex` except it has the `matches` field that maps each complete matching string to its locations.

    note: this type should only ever be constructed by `Regex.evaluate()`. it is not part of the public API.
    """

-    def __init__(self, regex, match):
+    def __init__(self, regex, matches):
        """
        args:
-          regex (Regex): the regex feature that matches
-          match (string|None): the matching string or None if it doesn't match
+          regex (Regex): the regex feature that matches.
+          matches (Dict[string, List[int]]|None): mapping from matching string to its locations, or None if the regex doesn't match.
        """
        super(_MatchedRegex, self).__init__(regex.value, description=regex.description)
        # we want this to collide with the name of `Regex` above,
        # so that it works nicely with the renderers.
        self.name = "regex"
        # this may be None if the regex doesn't match
-        self.match = match
+        self.matches = matches

    def __str__(self):
-        return 'regex(string =~ %s, matched = "%s")' % (self.value, self.match)
+        return 'regex(string =~ %s, matches = %s)' % (self.value, ", ".join(map(lambda s: '"' + s + '"', (self.matches or {}).keys())))


 class StringFactory(object):
--- a/capa/ida/plugin/model.py
+++ b/capa/ida/plugin/model.py
@@ -12,6 +12,8 @@ import idc
 import idaapi
 from PyQt5 import QtGui, QtCore

+import capa.rules
+import capa.features
 import capa.ida.helpers
 import capa.render.utils as rutils
 from capa.ida.plugin.item import (
@@ -556,7 +558,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):

        if feature["type"] == "regex":
            return CapaExplorerStringViewItem(
-                parent, display, location, '"%s"' % capa.features.escape_string(feature["match"])
+                parent, display, location, "\n".join(map(lambda s: '"' + capa.features.escape_string(s) + '"', feature["matches"].keys()))
            )

        if feature["type"] == "basicblock":
--- a/capa/render/__init__.py
+++ b/capa/render/__init__.py
@@ -72,7 +72,7 @@ def convert_feature_to_result_document(feature):
    if feature.description:
        result["description"] = feature.description
    if feature.name == "regex":
-        result["match"] = feature.match
+        result["matches"] = feature.matches
    return result


--- a/capa/render/vverbose.py
+++ b/capa/render/vverbose.py
@@ -6,10 +6,9 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.

-import collections
-
 import tabulate

+import capa.features
 import capa.rules
 import capa.render.utils as rutils
 import capa.render.verbose
@@ -85,31 +84,50 @@ def render_statement(ostream, match, statement, indent=0):
        raise RuntimeError("unexpected match statement type: " + str(statement))


+def render_string_value(s):
+    return '"%s"' % capa.features.escape_string(s)
+
+
 def render_feature(ostream, match, feature, indent=0):
    ostream.write("  " * indent)

    key = feature["type"]
    value = feature[feature["type"]]
-    if key == "regex":
-        key = "string"  # render string for regex to mirror the rule source
-        value = feature["match"]  # the match provides more information than the value for regex

-    if key == "string":
-        value = '"%s"' % capa.features.escape_string(value)
+    if key != "regex":
+        # like:
+        #   number: 10 = SOME_CONSTANT @ 0x401000
+        if key == "string":
+            value = render_string_value(value)

-    ostream.write(key)
-    ostream.write(": ")
+        ostream.write(key)
+        ostream.write(": ")

-    if value:
-        ostream.write(rutils.bold2(value))
+        if value:
+            ostream.write(rutils.bold2(value))

-        if "description" in feature:
-            ostream.write(capa.rules.DESCRIPTION_SEPARATOR)
-            ostream.write(feature["description"])
+            if "description" in feature:
+                ostream.write(capa.rules.DESCRIPTION_SEPARATOR)
+                ostream.write(feature["description"])

-    render_locations(ostream, match)
-    ostream.write("\n")
+        render_locations(ostream, match)
+        ostream.write("\n")
+    else:
+        # like:
+        #  regex: /blah/ = SOME_CONSTANT
+        #    - "foo blah baz" @ 0x401000
+        #    - "aaa blah bbb" @ 0x402000, 0x403400
+        ostream.write(key)
+        ostream.write(": ")
+        ostream.write(value)
+        ostream.write("\n")

+        for match, locations in sorted(feature["matches"].items(), key=lambda p: p[0]):
+            ostream.write("  " * (indent + 1))
+            ostream.write("- ")
+            ostream.write(rutils.bold2(render_string_value(match)))
+            render_locations(ostream, {"locations": locations})
+            ostream.write("\n")

 def render_node(ostream, match, node, indent=0):
    if node["type"] == "statement":