Mirror of https://github.com/mandiant/capa.git, synced 2025-12-17 01:47:54 -08:00
Compare commits mr/library...wb/library: 1 commit (03ce40e781)

capa/analysis/__init__.py
@@ -1,164 +0,0 @@
```python
import io
import sys
import time
import logging
import argparse
from pathlib import Path

import rich
import rich.table
from rich.console import Console
from rich.logging import RichHandler

import capa.helpers
import capa.features.extractors.ida.idalib as idalib

if not idalib.has_idalib():
    raise RuntimeError("cannot find IDA idalib module.")

if not idalib.load_idalib():
    raise RuntimeError("failed to load IDA idalib module.")

import idaapi
import idapro
import ida_auto
import idautils
import ida_funcs

logger = logging.getLogger(__name__)


from pydantic import BaseModel


def colorbool(v: bool) -> str:
    if v:
        return f"[green]{v}[/green]"
    else:
        return f"[red]{v}[/red]"


def colorname(n: str) -> str:
    if n.startswith("sub_"):
        return n
    else:
        return f"[cyan]{n}[/cyan]"


class FunctionId(BaseModel):
    address: int
    is_library: bool
    is_thunk: bool
    name: str

    def to_row(self):
        row = [hex(self.address)]
        row.append(colorbool(self.is_library))
        row.append(colorbool(self.is_thunk))
        row.append(colorname(self.name))
        return row


def configure_logging(args):
    if args.quiet:
        logging.getLogger().setLevel(logging.WARNING)
    elif args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)

    # use [/] after the logger name to reset any styling,
    # and prevent the color from carrying over to the message
    logformat = "[dim]%(name)s[/]: %(message)s"

    # set markup=True to allow the use of Rich's markup syntax in log messages
    rich_handler = RichHandler(markup=True, show_time=False, show_path=True, console=capa.helpers.log_console)
    rich_handler.setFormatter(logging.Formatter(logformat))

    # use RichHandler for root logger
    logging.getLogger().addHandler(rich_handler)

    if args.debug:
        logging.getLogger("capa").setLevel(logging.DEBUG)
        logging.getLogger("viv_utils").setLevel(logging.DEBUG)
    else:
        logging.getLogger("capa").setLevel(logging.ERROR)
        logging.getLogger("viv_utils").setLevel(logging.ERROR)


def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description="Identify library functions using FLIRT.")
    parser.add_argument(
        "input_file",
        type=Path,
        help="path to file to analyze",
    )
    parser.add_argument("-d", "--debug", action="store_true", help="enable debugging output on STDERR")
    parser.add_argument("-q", "--quiet", action="store_true", help="disable all output but errors")
    args = parser.parse_args(args=argv)

    configure_logging(args)

    time0 = time.time()

    # stderr=True is used here to redirect the spinner banner to stderr, so that users can redirect capa's output.
    console = Console(stderr=True, quiet=False)

    logger.debug("idalib: opening database...")
    # idalib writes to stdout (ugh), so we have to capture that
    # so as not to screw up structured output.
    with capa.helpers.stdout_redirector(io.BytesIO()):
        with console.status("analyzing program...", spinner="dots"):
            if idapro.open_database(str(args.input_file), run_auto_analysis=True):
                raise RuntimeError("failed to analyze input file")

            logger.debug("idalib: waiting for analysis...")

            # TODO: add more signature files
            # TODO: apply more signatures

            ida_auto.auto_wait()
    logger.debug("idalib: opened database.")

    table = rich.table.Table()
    table.add_column("FVA")
    table.add_column("library?")
    table.add_column("thunk?")
    table.add_column("name")

    LIBONLY = True
    count = 0

    for ea in idautils.Functions(start=None, end=None):
        f = idaapi.get_func(ea)
        is_thunk = bool(f.flags & idaapi.FUNC_THUNK)
        is_lib = bool(f.flags & idaapi.FUNC_LIB)
        fname = idaapi.get_func_name(ea)

        if LIBONLY and not is_lib:
            continue

        fid = FunctionId(address=ea, is_library=is_lib, is_thunk=is_thunk, name=fname)
        table.add_row(*fid.to_row())

        count += 1
        if count > 50:
            break

    rich.print(table)

    # TODO: can we include which signature matched per function?
    for index in range(0, ida_funcs.get_idasgn_qty()):
        signame, optlibs, nmatches = ida_funcs.get_idasgn_desc_with_matches(index)
        rich.print(signame, optlibs, nmatches)

    idapro.close_database()

    minutes, seconds = divmod(time.time() - time0, 60)
    logger.debug("FLIRT-based library identification ran for ~ %02d:%02dm", minutes, seconds)


if __name__ == "__main__":
    sys.exit(main())
```
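For reference, an example invocation of the script above (the sample name is hypothetical; assumes IDA's idalib Python module is installed and licensed):

```console
$ python capa/analysis/__init__.py --debug suspicious.exe_
```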
capa/analysis/libraries.py
@@ -0,0 +1,193 @@
```python
"""
further requirements:
- nltk
"""

import sys
import logging
import collections
from pathlib import Path

import rich
from rich.text import Text

import capa.analysis.strings
import capa.features.extractors.strings
from capa.analysis.strings import LibraryStringDatabase

logger = logging.getLogger(__name__)


def extract_strings(buf, n=4):
    yield from capa.features.extractors.strings.extract_ascii_strings(buf, n=n)
    yield from capa.features.extractors.strings.extract_unicode_strings(buf, n=n)


def prune_databases(dbs: list[LibraryStringDatabase], n=8):
    """remove less trustworthy database entries,

    such as:
      - those found in multiple databases
      - those that are English words
      - those that are too short
      - Windows API and DLL names
    """
    # TODO: consider applying these filters directly to the persisted databases, not at load time.

    winapi = capa.analysis.strings.WindowsApiStringDatabase.from_defaults()

    try:
        # nltk may be installed while its "words" corpus data is missing,
        # in which case the first access raises LookupError.
        from nltk.corpus import words as nltk_words

        words = set(nltk_words.words())
    except LookupError:
        # one-time download of dataset.
        # this probably doesn't work well for embedded use.
        import nltk

        nltk.download("words")
        words = set(nltk_words.words())

    counter = collections.Counter()
    to_remove = set()
    for db in dbs:
        for string in db.metadata_by_string.keys():
            counter[string] += 1

            if string in words:
                to_remove.add(string)
                continue

            if len(string) < n:
                to_remove.add(string)
                continue

            if string in winapi.api_names:
                to_remove.add(string)
                continue

            if string in winapi.dll_names:
                to_remove.add(string)
                continue

    for string, count in counter.most_common():
        if count <= 1:
            break

        # remove strings that are seen in more than one database
        to_remove.add(string)

    for db in dbs:
        for string in to_remove:
            if string in db.metadata_by_string:
                del db.metadata_by_string[string]


def open_ida(input_path: Path):
    import tempfile

    import idapro

    t = Path(tempfile.mkdtemp(prefix="ida-")) / input_path.name
    t.write_bytes(input_path.read_bytes())
    # resource leak: we should delete this temporary copy upon exit

    idapro.enable_console_messages(False)
    idapro.open_database(str(t.absolute()), run_auto_analysis=True)

    import ida_auto

    ida_auto.auto_wait()


def main():
    logging.basicConfig(level=logging.DEBUG)

    # use n=8 to ignore common words
    N = 8

    input_path = Path(sys.argv[1])
    input_buf = input_path.read_bytes()

    dbs = capa.analysis.strings.get_default_databases()
    prune_databases(dbs, n=N)

    strings_by_library = collections.defaultdict(set)
    for string in extract_strings(input_buf, n=N):
        for db in dbs:
            if (metadata := db.metadata_by_string.get(string.s)):
                strings_by_library[metadata.library_name].add(string.s)

    console = rich.get_console()
    console.print("found libraries:", style="bold")
    for library, strings in sorted(strings_by_library.items(), key=lambda p: len(p[1]), reverse=True):
        console.print(f"  - [b]{library}[/] ({len(strings)} strings)")

        for string in sorted(strings)[:10]:
            console.print(f"      - {string}", markup=False, style="grey37")

        if len(strings) > 10:
            console.print("        ...", style="grey37")

    if not strings_by_library:
        console.print("  (none)", style="grey37")
        # since we're not going to find any strings,
        # return early and don't do IDA analysis
        return

    # TODO: ensure there are XXX matches for each library, or ignore those entries

    open_ida(input_path)

    import idaapi
    import idautils
    import ida_funcs
    import capa.features.extractors.ida.helpers as ida_helpers

    strings_by_function = collections.defaultdict(set)
    for ea in idautils.Functions():
        f = idaapi.get_func(ea)

        # ignore library functions and thunk functions as identified by IDA
        if f.flags & idaapi.FUNC_THUNK:
            continue
        if f.flags & idaapi.FUNC_LIB:
            continue

        for bb in ida_helpers.get_function_blocks(f):
            for insn in ida_helpers.get_instructions_in_range(bb.start_ea, bb.end_ea):
                ref = ida_helpers.find_data_reference_from_insn(insn)
                if ref == insn.ea:
                    continue

                string = ida_helpers.find_string_at(ref)
                if not string:
                    continue

                for db in dbs:
                    if (metadata := db.metadata_by_string.get(string)):
                        strings_by_function[ea].add(string)

    # TODO: ensure there are at least XXX functions renamed, or ignore those entries

    console.print("functions:", style="bold")
    for function, strings in sorted(strings_by_function.items()):
        if strings:
            name = ida_funcs.get_func_name(function)

            console.print(f"  [b]{name}[/]@{function:08x}:")

            for string in strings:
                for db in dbs:
                    if (metadata := db.metadata_by_string.get(string)):
                        location = Text(
                            f"{metadata.library_name}@{metadata.library_version}::{metadata.function_name}",
                            style="grey37",
                        )
                        console.print("    - ", location, ": ", string.rstrip())

            # TODO: ensure there aren't conflicts among the matches

    console.print()

    console.print(
        f"found {len(strings_by_function)} library functions across {len(list(idautils.Functions()))} functions"
    )


if __name__ == "__main__":
    main()
```
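An example invocation of this script (the sample name is hypothetical; the string-matching phase runs standalone, while the per-function phase additionally requires idalib):

```console
$ python capa/analysis/libraries.py suspicious.exe_
```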
capa/analysis/strings/__init__.py
@@ -0,0 +1,95 @@
```python
import gzip
import pathlib
from typing import Dict, Sequence
from dataclasses import dataclass

import msgspec


class LibraryString(msgspec.Struct):
    string: str
    library_name: str
    library_version: str
    file_path: str | None = None
    function_name: str | None = None
    line_number: int | None = None


@dataclass
class LibraryStringDatabase:
    metadata_by_string: Dict[str, LibraryString]

    def __len__(self) -> int:
        return len(self.metadata_by_string)

    @classmethod
    def from_file(cls, path: pathlib.Path) -> "LibraryStringDatabase":
        metadata_by_string: Dict[str, LibraryString] = {}
        decoder = msgspec.json.Decoder(type=LibraryString)
        for line in gzip.decompress(path.read_bytes()).split(b"\n"):
            if not line:
                continue
            s = decoder.decode(line)
            metadata_by_string[s.string] = s

        return cls(metadata_by_string=metadata_by_string)


DEFAULT_FILENAMES = (
    "brotli.jsonl.gz",
    "bzip2.jsonl.gz",
    "cryptopp.jsonl.gz",
    "curl.jsonl.gz",
    "detours.jsonl.gz",
    "jemalloc.jsonl.gz",
    "jsoncpp.jsonl.gz",
    "kcp.jsonl.gz",
    "liblzma.jsonl.gz",
    "libsodium.jsonl.gz",
    "libpcap.jsonl.gz",
    "mbedtls.jsonl.gz",
    "openssl.jsonl.gz",
    "sqlite3.jsonl.gz",
    "tomcrypt.jsonl.gz",
    "wolfssl.jsonl.gz",
    "zlib.jsonl.gz",
)

DEFAULT_PATHS = tuple(
    pathlib.Path(__file__).parent / "data" / "oss" / filename for filename in DEFAULT_FILENAMES
) + (pathlib.Path(__file__).parent / "data" / "crt" / "msvc_v143.jsonl.gz",)


def get_default_databases() -> Sequence[LibraryStringDatabase]:
    return [LibraryStringDatabase.from_file(path) for path in DEFAULT_PATHS]


@dataclass
class WindowsApiStringDatabase:
    dll_names: set[str]
    api_names: set[str]

    def __len__(self) -> int:
        return len(self.dll_names) + len(self.api_names)

    @classmethod
    def from_dir(cls, path: pathlib.Path) -> "WindowsApiStringDatabase":
        dll_names: set[str] = set()
        api_names: set[str] = set()

        for line in gzip.decompress((path / "dlls.txt.gz").read_bytes()).decode("utf-8").splitlines():
            if not line:
                continue
            dll_names.add(line)

        for line in gzip.decompress((path / "apis.txt.gz").read_bytes()).decode("utf-8").splitlines():
            if not line:
                continue
            api_names.add(line)

        return cls(dll_names=dll_names, api_names=api_names)

    @classmethod
    def from_defaults(cls) -> "WindowsApiStringDatabase":
        return cls.from_dir(pathlib.Path(__file__).parent / "data" / "winapi")
```
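A minimal usage sketch (not part of this commit) showing how the API above fits together: load the bundled databases and look up a candidate string against each one.

```python
# hypothetical example: the lookup string is illustrative and only
# yields a hit if it actually appears in one of the bundled databases.
from capa.analysis.strings import get_default_databases

dbs = get_default_databases()
for db in dbs:
    if (metadata := db.metadata_by_string.get("inflate 1.2.13 Copyright 1995-2022 Mark Adler")):
        print(metadata.library_name, metadata.library_version, metadata.function_name)
```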
capa/analysis/strings/data/crt/msvc_v143.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/.gitignore (vendored)
@@ -0,0 +1,3 @@

```
*.csv
*.jsonl
*.jsonl.gz
```
capa/analysis/strings/data/oss/brotli.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/bzip2.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/cryptopp.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/curl.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/detours.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/jemalloc.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/jh_to_qs.py
@@ -0,0 +1,52 @@
```python
"""
convert from a jh CSV file to a .jsonl.gz OpenSourceString database.

the jh file looks like:

    # triplet,compiler,library,version,profile,path,function,type,value
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffff8
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffffe
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,BZ2_bzCompressInit
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,handle_compress
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0x0000fa90
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff8
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff9
    x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffffd

jh is found here: https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs
"""
import sys
import json
import pathlib

import msgspec

from capa.analysis.strings import LibraryString

p = pathlib.Path(sys.argv[1])
for line in p.read_text().split("\n"):
    if not line:
        continue

    if line.startswith("#"):
        continue

    triplet, compiler, library, version, profile, path, function, rest = line.split(",", 7)
    type, _, value = rest.partition(",")
    if type != "string":
        continue

    if value.startswith('"'):
        value = json.loads(value)

    s = LibraryString(
        string=value,
        library_name=library,
        library_version=version,
        file_path=path,
        function_name=function,
    )

    sys.stdout.buffer.write(msgspec.json.encode(s))
    sys.stdout.buffer.write(b"\n")
```
capa/analysis/strings/data/oss/jsoncpp.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/kcp.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/liblzma.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/libpcap.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/libsodium.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/mbedtls.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/openssl.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/readme.md
@@ -0,0 +1,99 @@
# Strings from Open Source libraries

This directory contains databases of strings extracted from open source software.
capa uses these databases to ignore functions that are likely library code.

There is one file for each database. Each database is a gzip-compressed JSONL file (one JSON document per line).
Each JSON document looks like this:

```json
{
  "string": "1.0.8, 13-Jul-2019",
  "library_name": "bzip2",
  "library_version": "1.0.8#3",
  "file_path": "CMakeFiles/bz2.dir/bzlib.c.obj",
  "function_name": "BZ2_bzlibVersion",
  "line_number": null
}
```
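For illustration, a minimal sketch (not part of this directory) that decodes such a database using only the Python standard library; the filename is hypothetical:

```python
import gzip
import json
from pathlib import Path

# each line of the decompressed file is one JSON document, per the schema above.
for line in gzip.decompress(Path("zlib.jsonl.gz").read_bytes()).splitlines():
    if not line:
        continue
    record = json.loads(line)
    print(record["library_name"], record["library_version"], repr(record["string"]))
```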
The following databases were extracted via the vcpkg & jh technique:

- brotli 1.0.9#5
- bzip2 1.0.8#3
- cryptopp 8.7.0
- curl 7.86.0#1
- detours 4.0.1#7
- jemalloc 5.3.0#1
- jsoncpp 1.9.5
- kcp 1.7
- liblzma 5.2.5#6
- libsodium 1.0.18#8
- libpcap 1.10.1#3
- mbedtls 2.28.1
- openssl 3.0.7#1
- sqlite3 3.40.0#1
- tomcrypt 1.18.2#2
- wolfssl 5.5.0
- zlib 1.2.13

This code was originally developed in FLOSS and imported into capa.

## The vcpkg & jh technique

Major steps:

1. build static libraries via vcpkg
2. extract features via jh
3. convert to JSONL format with `jh_to_qs.py`
4. compress with gzip

### Build static libraries via vcpkg

[vcpkg](https://vcpkg.io/en/) is a free C/C++ package manager for acquiring and managing libraries.
We use it to easily build common open source libraries, like zlib.
Use the triplet `x64-windows-static` to build static archives (.lib files that are AR archives containing COFF object files):

```console
PS > C:\vcpkg\vcpkg.exe install --triplet x64-windows-static zlib
```

### Extract features via jh

[jh](https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs)
is a lancelot-based utility that parses AR archives containing COFF object files,
reconstructs their control flow, finds functions, and extracts features.
jh extracts numbers, API calls, and strings; we are only interested in the string features.

For each feature, jh emits a CSV line with the following fields:

- target triplet
- compiler
- library
- version
- build profile
- path
- function
- feature type
- feature value

For example:

```csv
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100
```

To invoke jh:

```console
$ ~/lancelot/target/release/jh x64-windows-static msvc143 zlib 1.2.13 release /mnt/c/vcpkg/installed/x64-windows-static/lib/zlib.lib > ~/flare-floss/floss/qs/db/data/oss/zlib.csv
```

### Convert to OSS database format

We use the script `jh_to_qs.py` to convert these CSV lines into a JSONL file prepared for FLOSS:

```console
$ python3 jh_to_qs.py zlib.csv > zlib.jsonl
```

These files are then gzip'd:

```console
$ gzip -c zlib.jsonl > zlib.jsonl.gz
```
capa/analysis/strings/data/oss/sqlite3.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/tomcrypt.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/wolfssl.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/oss/zlib.jsonl.gz (binary, new file; not shown)
capa/analysis/strings/data/winapi/apis.txt.gz (binary, new file; not shown)
capa/analysis/strings/data/winapi/dlls.txt.gz (binary, new file; not shown)