strings: fix type hints and uncovered bugs (#2555)

* strings: fix type hints and uncovered bugs changelog add strings tests strings: fix buf_filled_with fix strings tests refactor: optimize and document buf_filled_with function in strings.py docs: add docstring to buf_filled_with function doc strings: add typing * strings: more validation and testing thanks @fariss * copyright
2025-12-05 20:40:05 -08:00 · 2025-01-16 01:59:16 -07:00
parent 3eef829410
commit 72fe291742
3 changed files with 166 additions and 21 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
 - vmray: load more analysis archives @mr-tz
 - dynamic: only check file limitations for static file formats @mr-tz
 - vmray: skip non-printable strings @mike-hunhoff
 - strings: add type hints and fix uncovered bugs @williballenthin #2555
 - elffile: handle symbols without a name @williballenthin #2553
 ### capa Explorer Web
--- a/capa/features/extractors/strings.py
+++ b/capa/features/extractors/strings.py
@@ -14,47 +14,85 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
 import string
 import contextlib
-from collections import namedtuple
+from dataclasses import dataclass
 from collections.abc import Iterator
 # Body of a regex character class (already escaped): the bytes that count as
 # string characters - space, ASCII punctuation, digits, letters, and tab.
 ASCII_BYTE = r" !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t".encode(
    "ascii"
 )
 # Matches a run of at least 4 consecutive ASCII_BYTE bytes.
 ASCII_RE_4 = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 4))
 # Matches a run of at least 4 ASCII_BYTE bytes that are each followed by a NUL byte.
 UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4))
 # Filler bytes: the extractors test buf[0] against these before calling buf_filled_with.
-REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"]
+REPEATS = {ord("A"), 0x00, 0xFE, 0xFF}
 # Size, in bytes, of the chunks that buf_filled_with compares at a time.
 SLICE_SIZE = 4096
 # The characters of string.printable, as a set.
 PRINTABLE_CHAR_SET = set(string.printable)
-String = namedtuple("String", ["s", "offset"])
+
@dataclass
 class String:
    """A string extracted from a byte buffer, together with where it was found."""
    # the decoded text of the string
    s: str
    # offset in the scanned buffer at which the string starts
    offset: int
def buf_filled_with(buf: bytes, character: int) -> bool:
    """Check if the given buffer is filled with the given character, repeatedly.

    Args:
        buf: The bytes buffer to check
        character: The byte value (0-255) to check for

    Returns:
        True if all bytes in the buffer match the character, False otherwise.
        The empty buffer contains no bytes, therefore always returns False.

    Raises:
        ValueError: if buf is non-empty and character is outside the range 0-255.
    """
    if not buf:
        return False

    if not (0 <= character <= 255):
        raise ValueError(f"Character value {character} outside valid byte range (0-255)")

    if len(buf) < SLICE_SIZE:
        return all(b == character for b in buf)

    # single big allocation, re-used each loop.
    # note: bytes([character]) is the one-byte string holding this value,
    # whereas bytes(character) would be `character` zero bytes.
    dupe_chunk = bytes([character]) * SLICE_SIZE

    for offset in range(0, len(buf), SLICE_SIZE):
        # each slice copies at most SLICE_SIZE bytes; returning at the first
        # mismatching chunk avoids scanning the rest of a large buffer.
        current_chunk = buf[offset : offset + SLICE_SIZE]

        if len(current_chunk) == SLICE_SIZE:
            # chunk-aligned comparison
            if dupe_chunk != current_chunk:
                return False
        else:
            # last loop, final chunk is shorter than SLICE_SIZE
            if dupe_chunk[: len(current_chunk)] != current_chunk:
                return False

    return True
-def extract_ascii_strings(buf, n=4):
+def extract_ascii_strings(buf: bytes, n: int = 4) -> Iterator[String]:
    """
    Extract ASCII strings from the given binary data.
-    :param buf: A bytestring.
+    Params:
-    :type buf: str
+      buf: the bytes from which to extract strings
-    :param n: The minimum length of strings to extract.
+      n: minimum string length
    :type n: int
    :rtype: Sequence[String]
    """
    if not buf:
        return
    if n < 1:
        raise ValueError("minimum string length must be positive")
    if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
        return
@@ -68,20 +106,21 @@ def extract_ascii_strings(buf, n=4):
        yield String(match.group().decode("ascii"), match.start())
-def extract_unicode_strings(buf, n=4):
+def extract_unicode_strings(buf: bytes, n: int = 4) -> Iterator[String]:
    """
    Extract naive UTF-16 strings from the given binary data.
-    :param buf: A bytestring.
+    Params:
-    :type buf: str
+      buf: the bytes from which to extract strings
-    :param n: The minimum length of strings to extract.
+      n: minimum string length
    :type n: int
    :rtype: Sequence[String]
    """
    if not buf:
        return
    if n < 1:
        raise ValueError("minimum string length must be positive")
    if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
        return
--- a/tests/test_strings.py
+++ b/tests/test_strings.py
@@ -0,0 +1,105 @@
 # Copyright 2025 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from capa.features.extractors.strings import (
    String,
    buf_filled_with,
    is_printable_str,
    extract_ascii_strings,
    extract_unicode_strings,
 )
 def test_buf_filled_with():
     """buf_filled_with is True only when every byte of a non-empty buffer equals the value."""
     expectations = [
         # single repeating byte
         (b"\x00" * 8, 0x00, True),
         (b"\xFF" * 8, 0xFF, True),
         # mixed bytes
         (b"\x00\x01" * 8, 0x00, False),
         (b"ABCD" * 8, ord("A"), False),
         # edge cases: the empty buffer, then a single byte
         (b"", 0x00, False),
         (b"\x00", 0x00, True),
     ]
     for buf, character, expected in expectations:
         assert buf_filled_with(buf, character) is expected
 def test_extract_ascii_strings():
     """Check the strings and offsets that extract_ascii_strings yields for small buffers."""
     # an empty buffer yields nothing
     assert list(extract_ascii_strings(b"")) == []

     # two NUL-terminated strings
     found = list(extract_ascii_strings(b"Hello World\x00This is a test\x00"))
     assert found == [String("Hello World", 0), String("This is a test", 12)]

     # min length: "Hi" is too short to be reported
     found = list(extract_ascii_strings(b"Hi\x00Test\x00", n=4))
     assert found == [String("Test", 3)]

     # a non-ASCII byte separates two strings
     found = list(extract_ascii_strings(b"Hello\xFFWorld\x00"))
     assert found == [String("Hello", 0), String("World", 6)]

     # only non-ASCII
     assert list(extract_ascii_strings(b"\xFF\xFF\xFF")) == []

     # leading NUL bytes do not hide the string that follows them
     found = list(extract_ascii_strings(b"\x00" * 8 + b"ValidString\x00"))
     assert found == [String("ValidString", 8)]
 def test_extract_unicode_strings():
     """Check the strings and offsets that extract_unicode_strings yields for small buffers."""
     # one string, each character followed by a NUL byte
     found = list(extract_unicode_strings(b"H\x00e\x00l\x00l\x00o\x00\x00\x00"))
     assert found == [String("Hello", 0)]

     # min length: "Hi" is too short to be reported
     found = list(extract_unicode_strings(b"H\x00i\x00\x00\x00T\x00e\x00s\x00t\x00\x00\x00", n=4))
     assert found == [String("Test", 6)]

     # invalid Unicode sequences
     found = list(extract_unicode_strings(b"H\x00\xFF\x00l\x00l\x00o\x00\x00\x00"))
     assert found == []

     # repeating bytes (should be skipped)
     found = list(extract_unicode_strings(b"\x00" * 8 + b"V\x00a\x00l\x00i\x00d\x00\x00\x00"))
     assert found == [String("Valid", 8)]
 def test_is_printable_str():
     """Check is_printable_str against strings grouped by the expected answer."""
     printable = [
         "Hello World",
         "123!@#",
         "\t\n\r",  # whitespace is printable
         "",  # empty string
         " ",  # single space
     ]
     not_printable = [
         "\x00\x01\x02",
         "Hello\x07World",
         "\x1b[31m",  # ANSI escape codes
         "\x7f",  # DEL character
     ]

     for text in printable:
         assert is_printable_str(text) is True
     for text in not_printable:
         assert is_printable_str(text) is False