mirror of
https://github.com/mandiant/capa.git
synced 2025-12-05 20:40:05 -08:00
strings: fix type hints and uncovered bugs (#2555)
* strings: fix type hints and uncovered bugs
  - changelog
  - add strings tests
  - strings: fix buf_filled_with
  - fix strings tests
  - refactor: optimize and document buf_filled_with function in strings.py
  - docs: add docstring to buf_filled_with function
  - doc
  - strings: add typing
* strings: more validation and testing (thanks @fariss)
* copyright
This commit is contained in:
@@ -15,6 +15,7 @@
|
|||||||
- vmray: load more analysis archives @mr-tz
|
- vmray: load more analysis archives @mr-tz
|
||||||
- dynamic: only check file limitations for static file formats @mr-tz
|
- dynamic: only check file limitations for static file formats @mr-tz
|
||||||
- vmray: skip non-printable strings @mike-hunhoff
|
- vmray: skip non-printable strings @mike-hunhoff
|
||||||
|
- strings: add type hints and fix uncovered bugs @williballenthin #2555
|
||||||
- elffile: handle symbols without a name @williballenthin #2553
|
- elffile: handle symbols without a name @williballenthin #2553
|
||||||
|
|
||||||
### capa Explorer Web
|
### capa Explorer Web
|
||||||
|
|||||||
@@ -14,47 +14,85 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import string
|
import string
|
||||||
import contextlib
|
import contextlib
|
||||||
from collections import namedtuple
|
from dataclasses import dataclass
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
ASCII_BYTE = r" !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t".encode(
|
ASCII_BYTE = r" !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t".encode(
|
||||||
"ascii"
|
"ascii"
|
||||||
)
|
)
|
||||||
ASCII_RE_4 = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 4))
|
ASCII_RE_4 = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 4))
|
||||||
UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4))
|
UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4))
|
||||||
REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"]
|
REPEATS = {ord("A"), 0x00, 0xFE, 0xFF}
|
||||||
SLICE_SIZE = 4096
|
SLICE_SIZE = 4096
|
||||||
PRINTABLE_CHAR_SET = set(string.printable)
|
PRINTABLE_CHAR_SET = set(string.printable)
|
||||||
|
|
||||||
String = namedtuple("String", ["s", "offset"])
|
|
||||||
|
@dataclass
|
||||||
|
class String:
|
||||||
|
s: str
|
||||||
|
offset: int
|
||||||
|
|
||||||
|
|
||||||
def buf_filled_with(buf, character):
|
def buf_filled_with(buf: bytes, character: int) -> bool:
|
||||||
dupe_chunk = character * SLICE_SIZE
|
"""Check if the given buffer is filled with the given character, repeatedly.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
buf: The bytes buffer to check
|
||||||
|
character: The byte value (0-255) to check for
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if all bytes in the buffer match the character, False otherwise.
|
||||||
|
The empty buffer contains no bytes, therefore always returns False.
|
||||||
|
"""
|
||||||
|
if not buf:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not (0 <= character <= 255):
|
||||||
|
raise ValueError(f"Character value {character} outside valid byte range (0-255)")
|
||||||
|
|
||||||
|
if len(buf) < SLICE_SIZE:
|
||||||
|
return all(b == character for b in buf)
|
||||||
|
|
||||||
|
# single big allocation, re-used each loop
|
||||||
|
dupe_chunk = bytes(character) * SLICE_SIZE
|
||||||
|
|
||||||
for offset in range(0, len(buf), SLICE_SIZE):
|
for offset in range(0, len(buf), SLICE_SIZE):
|
||||||
new_chunk = buf[offset : offset + SLICE_SIZE]
|
# bytes objects are immutable, so the slices share the underlying array,
|
||||||
if dupe_chunk[: len(new_chunk)] != new_chunk:
|
# and therefore this is cheap.
|
||||||
return False
|
current_chunk = buf[offset : offset + SLICE_SIZE]
|
||||||
|
|
||||||
|
if len(current_chunk) == SLICE_SIZE:
|
||||||
|
# chunk-aligned comparison
|
||||||
|
|
||||||
|
if dupe_chunk != current_chunk:
|
||||||
|
return False
|
||||||
|
|
||||||
|
else:
|
||||||
|
# last loop, final chunk size is not aligned
|
||||||
|
if not all(b == character for b in current_chunk):
|
||||||
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def extract_ascii_strings(buf, n=4):
|
def extract_ascii_strings(buf: bytes, n: int = 4) -> Iterator[String]:
|
||||||
"""
|
"""
|
||||||
Extract ASCII strings from the given binary data.
|
Extract ASCII strings from the given binary data.
|
||||||
|
|
||||||
:param buf: A bytestring.
|
Params:
|
||||||
:type buf: str
|
buf: the bytes from which to extract strings
|
||||||
:param n: The minimum length of strings to extract.
|
n: minimum string length
|
||||||
:type n: int
|
|
||||||
:rtype: Sequence[String]
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not buf:
|
if not buf:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if n < 1:
|
||||||
|
raise ValueError("minimum string length must be positive")
|
||||||
|
|
||||||
if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
|
if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -68,20 +106,21 @@ def extract_ascii_strings(buf, n=4):
|
|||||||
yield String(match.group().decode("ascii"), match.start())
|
yield String(match.group().decode("ascii"), match.start())
|
||||||
|
|
||||||
|
|
||||||
def extract_unicode_strings(buf, n=4):
|
def extract_unicode_strings(buf: bytes, n: int = 4) -> Iterator[String]:
|
||||||
"""
|
"""
|
||||||
Extract naive UTF-16 strings from the given binary data.
|
Extract naive UTF-16 strings from the given binary data.
|
||||||
|
|
||||||
:param buf: A bytestring.
|
Params:
|
||||||
:type buf: str
|
buf: the bytes from which to extract strings
|
||||||
:param n: The minimum length of strings to extract.
|
n: minimum string length
|
||||||
:type n: int
|
|
||||||
:rtype: Sequence[String]
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not buf:
|
if not buf:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if n < 1:
|
||||||
|
raise ValueError("minimum string length must be positive")
|
||||||
|
|
||||||
if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
|
if (buf[0] in REPEATS) and buf_filled_with(buf, buf[0]):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
105
tests/test_strings.py
Normal file
105
tests/test_strings.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
# Copyright 2025 Google LLC
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from capa.features.extractors.strings import (
|
||||||
|
String,
|
||||||
|
buf_filled_with,
|
||||||
|
is_printable_str,
|
||||||
|
extract_ascii_strings,
|
||||||
|
extract_unicode_strings,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_buf_filled_with():
    """Exercise buf_filled_with on short buffers, edge cases, and chunk-sized buffers."""
    # single repeating byte
    assert buf_filled_with(b"\x00" * 8, 0x00) is True
    assert buf_filled_with(b"\xFF" * 8, 0xFF) is True

    # mixed bytes
    assert buf_filled_with(b"\x00\x01" * 8, 0x00) is False
    assert buf_filled_with(b"ABCD" * 8, ord("A")) is False

    # edge cases
    assert buf_filled_with(b"", 0x00) is False  # Empty buffer
    assert buf_filled_with(b"\x00", 0x00) is True  # Single byte

    # buffers of 4096 bytes or more (the module's chunk size) take the
    # chunk-by-chunk comparison path, which the short buffers above never reach
    assert buf_filled_with(b"\x00" * 4096, 0x00) is True  # exactly one chunk
    assert buf_filled_with(b"A" * (2 * 4096 + 1), ord("A")) is True  # unaligned tail
    assert buf_filled_with(b"A" * 4096 + b"B", ord("A")) is False  # mismatch in tail
    assert buf_filled_with(b"B" + b"A" * 4096, ord("A")) is False  # mismatch in first chunk
    assert buf_filled_with(b"\x00" * 4096, 0x01) is False  # wrong fill value
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_ascii_strings():
    """Check ASCII extraction results and offsets for a handful of small buffers."""
    # an empty buffer yields nothing
    assert list(extract_ascii_strings(b"")) == []

    # two NUL-terminated strings, reported with their start offsets
    found = list(extract_ascii_strings(b"Hello World\x00This is a test\x00"))
    assert found == [String("Hello World", 0), String("This is a test", 12)]

    # runs shorter than the minimum length are dropped
    found = list(extract_ascii_strings(b"Hi\x00Test\x00", n=4))
    assert found == [String("Test", 3)]

    # a non-ASCII byte separates two strings
    found = list(extract_ascii_strings(b"Hello\xFFWorld\x00"))
    assert found == [String("Hello", 0), String("World", 6)]

    # only non-ASCII
    assert list(extract_ascii_strings(b"\xFF\xFF\xFF")) == []

    # leading NUL padding is not reported; the string after it is found at offset 8
    found = list(extract_ascii_strings(b"\x00" * 8 + b"ValidString\x00"))
    assert found == [String("ValidString", 8)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_unicode_strings():
    """Check naive UTF-16 extraction results and offsets for a handful of small buffers."""
    # a single NUL-interleaved string at the start of the buffer
    found = list(extract_unicode_strings(b"H\x00e\x00l\x00l\x00o\x00\x00\x00"))
    assert found == [String("Hello", 0)]

    # runs shorter than the minimum length are dropped
    found = list(extract_unicode_strings(b"H\x00i\x00\x00\x00T\x00e\x00s\x00t\x00\x00\x00", n=4))
    assert found == [String("Test", 6)]

    # a byte outside the ASCII set (0xFF) interrupts the run; nothing is reported
    found = list(extract_unicode_strings(b"H\x00\xFF\x00l\x00l\x00o\x00\x00\x00"))
    assert found == []

    # leading NUL padding is not reported; the string after it is found at offset 8
    found = list(extract_unicode_strings(b"\x00" * 8 + b"V\x00a\x00l\x00i\x00d\x00\x00\x00"))
    assert found == [String("Valid", 8)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_printable_str():
    """Check is_printable_str against printable, non-printable, and boundary inputs."""
    expected_printable = (
        "Hello World",
        "123!@#",
        "\t\n\r",  # whitespace is printable
        "",  # empty string
        " ",  # single space
    )
    for text in expected_printable:
        assert is_printable_str(text) is True

    expected_unprintable = (
        "\x00\x01\x02",
        "Hello\x07World",
        "\x1b[31m",  # ANSI escape codes
        "\x7f",  # DEL character
    )
    for text in expected_unprintable:
        assert is_printable_str(text) is False
|
||||||
Reference in New Issue
Block a user