mirror of
https://github.com/mandiant/capa.git
synced 2025-12-05 20:40:05 -08:00
initial attempt at library identification via known strings
This commit is contained in:
0
capa/analysis/__init__.py
Normal file
0
capa/analysis/__init__.py
Normal file
193
capa/analysis/libraries.py
Normal file
193
capa/analysis/libraries.py
Normal file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
further requirements:
|
||||
- nltk
|
||||
"""
|
||||
|
||||
import sys
|
||||
import logging
|
||||
import collections
|
||||
from pathlib import Path
|
||||
|
||||
import rich
|
||||
from rich.text import Text
|
||||
|
||||
import capa.analysis.strings
|
||||
import capa.features.extractors.strings
|
||||
from capa.analysis.strings import LibraryStringDatabase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_strings(buf, n=4):
|
||||
yield from capa.features.extractors.strings.extract_ascii_strings(buf, n=n)
|
||||
yield from capa.features.extractors.strings.extract_unicode_strings(buf, n=n)
|
||||
|
||||
|
||||
def prune_databases(dbs: list[LibraryStringDatabase], n=8):
|
||||
"""remove less trustyworthy database entries.
|
||||
|
||||
such as:
|
||||
- those found in multiple databases
|
||||
- those that are English words
|
||||
- those that are too short
|
||||
- Windows API and DLL names
|
||||
"""
|
||||
|
||||
# TODO: consider applying these filters directly to the persisted databases, not at load time.
|
||||
|
||||
winapi = capa.analysis.strings.WindowsApiStringDatabase.from_defaults()
|
||||
|
||||
try:
|
||||
from nltk.corpus import words as nltk_words
|
||||
except ImportError:
|
||||
# one-time download of dataset.
|
||||
# this probably doesn't work well for embedded use.
|
||||
import nltk
|
||||
nltk.download("words")
|
||||
from nltk.corpus import words as nltk_words
|
||||
words = set(nltk_words.words())
|
||||
|
||||
counter = collections.Counter()
|
||||
to_remove = set()
|
||||
for db in dbs:
|
||||
for string in db.metadata_by_string.keys():
|
||||
counter[string] += 1
|
||||
|
||||
if string in words:
|
||||
to_remove.add(string)
|
||||
continue
|
||||
|
||||
if len(string) < n:
|
||||
to_remove.add(string)
|
||||
continue
|
||||
|
||||
if string in winapi.api_names:
|
||||
to_remove.add(string)
|
||||
continue
|
||||
|
||||
if string in winapi.dll_names:
|
||||
to_remove.add(string)
|
||||
continue
|
||||
|
||||
for string, count in counter.most_common():
|
||||
if count <= 1:
|
||||
break
|
||||
|
||||
# remove strings that are seen in more than one database
|
||||
to_remove.add(string)
|
||||
|
||||
for db in dbs:
|
||||
for string in to_remove:
|
||||
if string in db.metadata_by_string:
|
||||
del db.metadata_by_string[string]
|
||||
|
||||
|
||||
def open_ida(input_path: Path):
|
||||
import tempfile
|
||||
|
||||
import idapro
|
||||
|
||||
t = Path(tempfile.mkdtemp(prefix="ida-")) / input_path.name
|
||||
t.write_bytes(input_path.read_bytes())
|
||||
# resource leak: we should delete this upon exit
|
||||
|
||||
idapro.enable_console_messages(False)
|
||||
idapro.open_database(str(t.absolute()), run_auto_analysis=True)
|
||||
|
||||
import ida_auto
|
||||
ida_auto.auto_wait()
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
# use n=8 to ignore common words
|
||||
N = 8
|
||||
|
||||
input_path = Path(sys.argv[1])
|
||||
input_buf = input_path.read_bytes()
|
||||
|
||||
dbs = capa.analysis.strings.get_default_databases()
|
||||
prune_databases(dbs, n=N)
|
||||
|
||||
strings_by_library = collections.defaultdict(set)
|
||||
for string in extract_strings(input_path.read_bytes(), n=N):
|
||||
for db in dbs:
|
||||
if (metadata := db.metadata_by_string.get(string.s)):
|
||||
strings_by_library[metadata.library_name].add(string.s)
|
||||
|
||||
console = rich.get_console()
|
||||
console.print(f"found libraries:", style="bold")
|
||||
for library, strings in sorted(strings_by_library.items(), key=lambda p: len(p[1]), reverse=True):
|
||||
console.print(f" - [b]{library}[/] ({len(strings)} strings)")
|
||||
|
||||
for string in sorted(strings)[:10]:
|
||||
console.print(f" - {string}", markup=False, style="grey37")
|
||||
|
||||
if len(strings) > 10:
|
||||
console.print(" ...", style="grey37")
|
||||
|
||||
if not strings_by_library:
|
||||
console.print(" (none)", style="grey37")
|
||||
# since we're not going to find any strings
|
||||
# return early and don't do IDA analysis
|
||||
return
|
||||
|
||||
# TODO: ensure there are XXX matches for each library, or ignore those entries
|
||||
|
||||
open_ida(input_path)
|
||||
|
||||
import idaapi
|
||||
import idautils
|
||||
import ida_funcs
|
||||
import capa.features.extractors.ida.helpers as ida_helpers
|
||||
|
||||
strings_by_function = collections.defaultdict(set)
|
||||
for ea in idautils.Functions():
|
||||
f = idaapi.get_func(ea)
|
||||
|
||||
# ignore library functions and thunk functions as identified by IDA
|
||||
if f.flags & idaapi.FUNC_THUNK:
|
||||
continue
|
||||
if f.flags & idaapi.FUNC_LIB:
|
||||
continue
|
||||
|
||||
for bb in ida_helpers.get_function_blocks(f):
|
||||
for insn in ida_helpers.get_instructions_in_range(bb.start_ea, bb.end_ea):
|
||||
ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn)
|
||||
if ref == insn.ea:
|
||||
continue
|
||||
|
||||
string = capa.features.extractors.ida.helpers.find_string_at(ref)
|
||||
if not string:
|
||||
continue
|
||||
|
||||
for db in dbs:
|
||||
if (metadata := db.metadata_by_string.get(string)):
|
||||
strings_by_function[ea].add(string)
|
||||
|
||||
# ensure there are at least XXX functions renamed, or ignore those entries
|
||||
|
||||
console.print("functions:", style="bold")
|
||||
for function, strings in sorted(strings_by_function.items()):
|
||||
if strings:
|
||||
name = ida_funcs.get_func_name(function)
|
||||
|
||||
console.print(f" [b]{name}[/]@{function:08x}:")
|
||||
|
||||
for string in strings:
|
||||
for db in dbs:
|
||||
if (metadata := db.metadata_by_string.get(string)):
|
||||
location = Text(f"{metadata.library_name}@{metadata.library_version}::{metadata.function_name}", style="grey37")
|
||||
console.print(" - ", location, ": ", string.rstrip())
|
||||
|
||||
# TODO: ensure there aren't conflicts among the matches
|
||||
|
||||
console.print()
|
||||
|
||||
console.print(f"found {len(strings_by_function)} library functions across {len(list(idautils.Functions()))} functions")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
95
capa/analysis/strings/__init__.py
Normal file
95
capa/analysis/strings/__init__.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import gzip
|
||||
import pathlib
|
||||
from typing import Dict, Sequence
|
||||
from dataclasses import dataclass
|
||||
|
||||
import msgspec
|
||||
|
||||
|
||||
class LibraryString(msgspec.Struct):
|
||||
string: str
|
||||
library_name: str
|
||||
library_version: str
|
||||
file_path: str | None = None
|
||||
function_name: str | None = None
|
||||
line_number: int | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class LibraryStringDatabase:
|
||||
metadata_by_string: Dict[str, LibraryString]
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.metadata_by_string)
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, path: pathlib.Path) -> "LibraryStringDatabase":
|
||||
metadata_by_string: Dict[str, LibraryString] = {}
|
||||
decoder = msgspec.json.Decoder(type=LibraryString)
|
||||
for line in gzip.decompress(path.read_bytes()).split(b"\n"):
|
||||
if not line:
|
||||
continue
|
||||
s = decoder.decode(line)
|
||||
metadata_by_string[s.string] = s
|
||||
|
||||
return cls(metadata_by_string=metadata_by_string)
|
||||
|
||||
|
||||
DEFAULT_FILENAMES = (
|
||||
"brotli.jsonl.gz",
|
||||
"bzip2.jsonl.gz",
|
||||
"cryptopp.jsonl.gz",
|
||||
"curl.jsonl.gz",
|
||||
"detours.jsonl.gz",
|
||||
"jemalloc.jsonl.gz",
|
||||
"jsoncpp.jsonl.gz",
|
||||
"kcp.jsonl.gz",
|
||||
"liblzma.jsonl.gz",
|
||||
"libsodium.jsonl.gz",
|
||||
"libpcap.jsonl.gz",
|
||||
"mbedtls.jsonl.gz",
|
||||
"openssl.jsonl.gz",
|
||||
"sqlite3.jsonl.gz",
|
||||
"tomcrypt.jsonl.gz",
|
||||
"wolfssl.jsonl.gz",
|
||||
"zlib.jsonl.gz",
|
||||
)
|
||||
|
||||
DEFAULT_PATHS = tuple(
|
||||
pathlib.Path(__file__).parent / "data" / "oss" / filename for filename in DEFAULT_FILENAMES
|
||||
) + (pathlib.Path(__file__).parent / "data" / "crt" / "msvc_v143.jsonl.gz",)
|
||||
|
||||
|
||||
def get_default_databases() -> Sequence[LibraryStringDatabase]:
|
||||
return [LibraryStringDatabase.from_file(path) for path in DEFAULT_PATHS]
|
||||
|
||||
|
||||
@dataclass
|
||||
class WindowsApiStringDatabase:
|
||||
dll_names: set[str]
|
||||
api_names: set[str]
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.dll_names) + len(self.api_names)
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, path: pathlib.Path) -> "WindowsApiStringDatabase":
|
||||
dll_names: Set[str] = set()
|
||||
api_names: Set[str] = set()
|
||||
|
||||
for line in gzip.decompress((path / "dlls.txt.gz").read_bytes()).decode("utf-8").splitlines():
|
||||
if not line:
|
||||
continue
|
||||
dll_names.add(line)
|
||||
|
||||
for line in gzip.decompress((path / "apis.txt.gz").read_bytes()).decode("utf-8").splitlines():
|
||||
if not line:
|
||||
continue
|
||||
api_names.add(line)
|
||||
|
||||
return cls(dll_names=dll_names, api_names=api_names)
|
||||
|
||||
@classmethod
|
||||
def from_defaults(cls) -> "WindowsApiStringDatabase":
|
||||
return cls.from_dir(pathlib.Path(__file__).parent / "data" / "winapi")
|
||||
|
||||
BIN
capa/analysis/strings/data/crt/msvc_v143.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/crt/msvc_v143.jsonl.gz
Normal file
Binary file not shown.
3
capa/analysis/strings/data/oss/.gitignore
vendored
Normal file
3
capa/analysis/strings/data/oss/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
*.csv
|
||||
*.jsonl
|
||||
*.jsonl.gz
|
||||
BIN
capa/analysis/strings/data/oss/brotli.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/brotli.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/bzip2.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/bzip2.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/cryptopp.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/cryptopp.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/curl.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/curl.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/detours.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/detours.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/jemalloc.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/jemalloc.jsonl.gz
Normal file
Binary file not shown.
52
capa/analysis/strings/data/oss/jh_to_qs.py
Normal file
52
capa/analysis/strings/data/oss/jh_to_qs.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
convert from a jh CSV file to a .jsonl.gz OpenSourceString database.
|
||||
|
||||
the jh file looks like:
|
||||
|
||||
# triplet,compiler,library,version,profile,path,function,type,value
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffff8
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffffe
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,BZ2_bzCompressInit
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,handle_compress
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0x0000fa90
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff8
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff9
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffffd
|
||||
|
||||
jh is found here: https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
import pathlib
|
||||
|
||||
import msgspec
|
||||
|
||||
from capa.analysis.strings import LibraryString
|
||||
|
||||
p = pathlib.Path(sys.argv[1])
|
||||
for line in p.read_text().split("\n"):
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
|
||||
triplet, compiler, library, version, profile, path, function, rest = line.split(",", 7)
|
||||
type, _, value = rest.partition(",")
|
||||
if type != "string":
|
||||
continue
|
||||
|
||||
if value.startswith('"'):
|
||||
value = json.loads(value)
|
||||
|
||||
s = LibraryString(
|
||||
string=value,
|
||||
library_name=library,
|
||||
library_version=version,
|
||||
file_path=path,
|
||||
function_name=function,
|
||||
)
|
||||
|
||||
sys.stdout.buffer.write(msgspec.json.encode(s))
|
||||
sys.stdout.buffer.write(b"\n")
|
||||
BIN
capa/analysis/strings/data/oss/jsoncpp.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/jsoncpp.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/kcp.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/kcp.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/liblzma.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/liblzma.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/libpcap.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/libpcap.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/libsodium.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/libsodium.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/mbedtls.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/mbedtls.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/openssl.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/openssl.jsonl.gz
Normal file
Binary file not shown.
99
capa/analysis/strings/data/oss/readme.md
Normal file
99
capa/analysis/strings/data/oss/readme.md
Normal file
@@ -0,0 +1,99 @@
|
||||
# Strings from Open Source libraries
|
||||
|
||||
This directory contains databases of strings extracted from open soure software.
|
||||
capa uses these databases to ignore functions that are likely library code.
|
||||
|
||||
There is one file for each database. Each database is a gzip-compressed, JSONL (one JSON document per line) file.
|
||||
The JSON document looks like this:
|
||||
|
||||
string: "1.0.8, 13-Jul-2019"
|
||||
library_name: "bzip2"
|
||||
library_version: "1.0.8#3"
|
||||
file_path: "CMakeFiles/bz2.dir/bzlib.c.obj"
|
||||
function_name: "BZ2_bzlibVersion"
|
||||
line_number: null
|
||||
|
||||
The following databases were extracted via the vkpkg & jh technique:
|
||||
|
||||
- brotli 1.0.9#5
|
||||
- bzip2 1.0.8#3
|
||||
- cryptopp 8.7.0
|
||||
- curl 7.86.0#1
|
||||
- detours 4.0.1#7
|
||||
- jemalloc 5.3.0#1
|
||||
- jsoncpp 1.9.5
|
||||
- kcp 1.7
|
||||
- liblzma 5.2.5#6
|
||||
- libsodium 1.0.18#8
|
||||
- libpcap 1.10.1#3
|
||||
- mbedtls 2.28.1
|
||||
- openssl 3.0.7#1
|
||||
- sqlite3 3.40.0#1
|
||||
- tomcrypt 1.18.2#2
|
||||
- wolfssl 5.5.0
|
||||
- zlib 1.2.13
|
||||
|
||||
This code was originally developed in FLOSS and imported into capa.
|
||||
|
||||
## The vkpkg & jh technique
|
||||
|
||||
Major steps:
|
||||
|
||||
1. build static libraries via vcpkg
|
||||
2. extract features via jh
|
||||
3. convert to JSONL format with `jh_to_qs.py`
|
||||
4. compress with gzip
|
||||
|
||||
### Build static libraries via vcpkg
|
||||
|
||||
[vcpkg](https://vcpkg.io/en/) is a free C/C++ package manager for acquiring and managing libraries.
|
||||
We use it to easily build common open source libraries, like zlib.
|
||||
Use the triplet `x64-windows-static` to build static archives (.lib files that are AR archives containing COFF object files):
|
||||
|
||||
```console
|
||||
PS > C:\vcpkg\vcpkg.exe install --triplet x64-windows-static zlib
|
||||
```
|
||||
|
||||
### Extract features via jh
|
||||
|
||||
[jh](https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs)
|
||||
is a lancelot-based utility that parses AR archives containing COFF object files,
|
||||
reconstructs their control flow, finds functions, and extracts features.
|
||||
jh extracts numbers, API calls, and strings; we are only interested in the string features.
|
||||
|
||||
For each feature, jh emits a CSV line with the fields
|
||||
- target triplet
|
||||
- compiler
|
||||
- library
|
||||
- version
|
||||
- build profile
|
||||
- path
|
||||
- function
|
||||
- feature type
|
||||
- feature value
|
||||
|
||||
For example:
|
||||
|
||||
```csv
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100
|
||||
```
|
||||
|
||||
For example, to invoke jh:
|
||||
|
||||
```console
|
||||
$ ~/lancelot/target/release/jh x64-windows-static msvc143 zlib 1.2.13 release /mnt/c/vcpkg/installed/x64-windows-static/lib/zlib.lib > ~/flare-floss/floss/qs/db/data/oss/zlib.csv
|
||||
```
|
||||
|
||||
### Convert to OSS database format
|
||||
|
||||
We use the script `jh_to_qs.py` to convert these CSV lines into JSONL file prepared for FLOSS:
|
||||
|
||||
```console
|
||||
$ python3 jh_to_qs.py zlib.csv > zlib.jsonl
|
||||
```
|
||||
|
||||
These files are then gzip'd:
|
||||
|
||||
```console
|
||||
$ gzip -c zlib.jsonl > zlib.jsonl.gz
|
||||
```
|
||||
BIN
capa/analysis/strings/data/oss/sqlite3.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/sqlite3.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/tomcrypt.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/tomcrypt.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/wolfssl.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/wolfssl.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/zlib.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/zlib.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/winapi/apis.txt.gz
Normal file
BIN
capa/analysis/strings/data/winapi/apis.txt.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/winapi/dlls.txt.gz
Normal file
BIN
capa/analysis/strings/data/winapi/dlls.txt.gz
Normal file
Binary file not shown.
Reference in New Issue
Block a user