mirror of
https://github.com/mandiant/capa.git
synced 2025-12-05 20:40:05 -08:00
strings: fix type hints and uncovered bugs (#2555)
* strings: fix type hints and uncovered bugs changelog add strings tests strings: fix buf_filled_with fix strings tests refactor: optimize and document buf_filled_with function in strings.py docs: add docstring to buf_filled_with function doc strings: add typing * strings: more validation and testing thanks @fariss * copyright
This commit is contained in:
@@ -15,6 +15,7 @@
|
||||
- vmray: load more analysis archives @mr-tz
|
||||
- dynamic: only check file limitations for static file formats @mr-tz
|
||||
- vmray: skip non-printable strings @mike-hunhoff
|
||||
- strings: add type hints and fix uncovered bugs @williballenthin #2555
|
||||
- elffile: handle symbols without a name @williballenthin #2553
|
||||
|
||||
### capa Explorer Web
|
||||
|
||||
@@ -14,47 +14,85 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import re
|
||||
import string
|
||||
import contextlib
|
||||
from collections import namedtuple
|
||||
from dataclasses import dataclass
|
||||
from collections.abc import Iterator
|
||||
|
||||
ASCII_BYTE = r" !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t".encode(
|
||||
"ascii"
|
||||
)
|
||||
ASCII_RE_4 = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 4))
|
||||
UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4))
|
||||
REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"]
|
||||
REPEATS = {ord("A"), 0x00, 0xFE, 0xFF}
|
||||
SLICE_SIZE = 4096
|
||||
PRINTABLE_CHAR_SET = set(string.printable)
|
||||
|
||||
String = namedtuple("String", ["s", "offset"])
|
||||
|
||||
@dataclass
|
||||
class String:
|
||||
s: str
|
||||
offset: int
|
||||
|
||||
|
||||
def buf_filled_with(buf, character):
|
||||
dupe_chunk = character * SLICE_SIZE
|
||||
for offset in range(0, len(buf), SLICE_SIZE):
|
||||
new_chunk = buf[offset : offset + SLICE_SIZE]
|
||||
if dupe_chunk[: len(new_chunk)] != new_chunk:
|
||||
def buf_filled_with(buf: bytes, character: int) -> bool:
    """Check if the given buffer is filled with the given character, repeatedly.

    Args:
        buf: The bytes buffer to check
        character: The byte value (0-255) to check for

    Returns:
        True if all bytes in the buffer match the character, False otherwise.
        The empty buffer contains no bytes, therefore always returns False.

    Raises:
        ValueError: If character is outside the valid byte range (0-255).
    """
    if not buf:
        return False

    if not (0 <= character <= 255):
        raise ValueError(f"Character value {character} outside valid byte range (0-255)")

    # bytes.count(int) counts occurrences of that byte value at C speed,
    # so the buffer is filled with `character` iff every byte matches.
    #
    # this replaces the previous chunked comparison, whose template was built
    # with `bytes(character) * SLICE_SIZE`: `bytes(n)` creates n zero bytes
    # (not a one-byte string), so chunk-aligned comparisons against buffers
    # of 4096+ bytes always failed. a single count avoids both the bug and
    # the per-chunk slicing.
    return buf.count(character) == len(buf)
|
||||
|
||||
|
||||
def extract_ascii_strings(buf, n=4):
|
||||
def extract_ascii_strings(buf: bytes, n: int = 4) -> Iterator[String]:
|
||||
"""
|
||||
Extract ASCII strings from the given binary data.
|
||||
|
||||
:param buf: A bytestring.
|
||||
:type buf: str
|
||||
:param n: The minimum length of strings to extract.
|
||||
:type n: int
|
||||
:rtype: Sequence[String]
|
||||
Params:
|
||||
buf: the bytes from which to extract strings
|
||||
n: minimum string length
|
||||
"""
|
||||
|
||||
if not buf:
|
||||
return
|
||||
|
||||
if n < 1:
|
||||
raise ValueError("minimum string length must be positive")
|
||||
|
||||
if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
|
||||
return
|
||||
|
||||
@@ -68,20 +106,21 @@ def extract_ascii_strings(buf, n=4):
|
||||
yield String(match.group().decode("ascii"), match.start())
|
||||
|
||||
|
||||
def extract_unicode_strings(buf, n=4):
|
||||
def extract_unicode_strings(buf: bytes, n: int = 4) -> Iterator[String]:
|
||||
"""
|
||||
Extract naive UTF-16 strings from the given binary data.
|
||||
|
||||
:param buf: A bytestring.
|
||||
:type buf: str
|
||||
:param n: The minimum length of strings to extract.
|
||||
:type n: int
|
||||
:rtype: Sequence[String]
|
||||
Params:
|
||||
buf: the bytes from which to extract strings
|
||||
n: minimum string length
|
||||
"""
|
||||
|
||||
if not buf:
|
||||
return
|
||||
|
||||
if n < 1:
|
||||
raise ValueError("minimum string length must be positive")
|
||||
|
||||
if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
|
||||
return
|
||||
|
||||
|
||||
105
tests/test_strings.py
Normal file
105
tests/test_strings.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# Copyright 2025 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from capa.features.extractors.strings import (
|
||||
String,
|
||||
buf_filled_with,
|
||||
is_printable_str,
|
||||
extract_ascii_strings,
|
||||
extract_unicode_strings,
|
||||
)
|
||||
|
||||
|
||||
def test_buf_filled_with():
    """buf_filled_with is True only when every byte equals the given value."""
    # buffers made of one repeated byte match that byte; a single byte counts
    filled = [
        (b"\x00" * 8, 0x00),
        (b"\xFF" * 8, 0xFF),
        (b"\x00", 0x00),
    ]
    for data, value in filled:
        assert buf_filled_with(data, value) is True

    # any differing byte breaks the match; the empty buffer is never "filled"
    not_filled = [
        (b"\x00\x01" * 8, 0x00),
        (b"ABCD" * 8, ord("A")),
        (b"", 0x00),
    ]
    for data, value in not_filled:
        assert buf_filled_with(data, value) is False
|
||||
|
||||
|
||||
def test_extract_ascii_strings():
    """extract_ascii_strings finds printable ASCII runs and their offsets."""
    # nothing to extract from an empty buffer
    assert list(extract_ascii_strings(b"")) == []

    # NUL-terminated strings are reported at their correct offsets
    found = list(extract_ascii_strings(b"Hello World\x00This is a test\x00"))
    assert found == [String("Hello World", 0), String("This is a test", 12)]

    # runs shorter than the minimum length are dropped
    assert list(extract_ascii_strings(b"Hi\x00Test\x00", n=4)) == [String("Test", 3)]

    # a non-ASCII byte splits one run into two
    found = list(extract_ascii_strings(b"Hello\xFFWorld\x00"))
    assert found == [String("Hello", 0), String("World", 6)]

    # a buffer of only non-ASCII bytes yields nothing
    assert list(extract_ascii_strings(b"\xFF\xFF\xFF")) == []

    # leading NUL padding does not hide a later string
    found = list(extract_ascii_strings(b"\x00" * 8 + b"ValidString\x00"))
    assert found == [String("ValidString", 8)]
|
||||
|
||||
|
||||
def test_extract_unicode_strings():
    """extract_unicode_strings finds naive UTF-16 runs and their offsets."""
    # a simple NUL-terminated UTF-16 string
    encoded = b"H\x00e\x00l\x00l\x00o\x00\x00\x00"
    assert list(extract_unicode_strings(encoded)) == [String("Hello", 0)]

    # runs shorter than the minimum length are dropped
    encoded = b"H\x00i\x00\x00\x00T\x00e\x00s\x00t\x00\x00\x00"
    assert list(extract_unicode_strings(encoded, n=4)) == [String("Test", 6)]

    # an invalid code unit breaks the run below the minimum length
    encoded = b"H\x00\xFF\x00l\x00l\x00o\x00\x00\x00"
    assert list(extract_unicode_strings(encoded)) == []

    # leading repeated bytes do not hide a later string
    encoded = b"\x00" * 8 + b"V\x00a\x00l\x00i\x00d\x00\x00\x00"
    assert list(extract_unicode_strings(encoded)) == [String("Valid", 8)]
|
||||
|
||||
|
||||
def test_is_printable_str():
    """is_printable_str is True iff every character is printable."""
    # printable text, punctuation, whitespace, the empty string, and a space
    for text in ("Hello World", "123!@#", "\t\n\r", "", " "):
        assert is_printable_str(text) is True

    # control characters (including BEL, ESC, and DEL) are not printable
    for text in ("\x00\x01\x02", "Hello\x07World", "\x1b[31m", "\x7f"):
        assert is_printable_str(text) is False
|
||||
Reference in New Issue
Block a user