From ad187fc3bdd72185cfa1bbf719f15ff2e53f16ec Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 11 Oct 2024 13:43:10 +0000 Subject: [PATCH] library detection: merge flirt and string branches --- capa/analysis/flirt.py | 12 +- capa/analysis/libraries.py | 199 +-------------------- capa/analysis/strings/__init__.py | 104 +++++++++-- capa/analysis/strings/__main__.py | 135 ++++++++++++++ capa/analysis/strings/data/oss/jh_to_qs.py | 52 ------ 5 files changed, 244 insertions(+), 258 deletions(-) create mode 100644 capa/analysis/strings/__main__.py delete mode 100644 capa/analysis/strings/data/oss/jh_to_qs.py diff --git a/capa/analysis/flirt.py b/capa/analysis/flirt.py index fc3e6a37..afc54af4 100644 --- a/capa/analysis/flirt.py +++ b/capa/analysis/flirt.py @@ -1,3 +1,11 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + import io import sys import time @@ -6,6 +14,7 @@ import argparse from pathlib import Path import rich +from pydantic import BaseModel from rich.console import Console from rich.logging import RichHandler @@ -27,9 +36,6 @@ import ida_funcs logger = logging.getLogger(__name__) -from pydantic import BaseModel - - def colorbool(v: bool) -> str: if v: return f"[green]{str(v)}[/green]" diff --git a/capa/analysis/libraries.py b/capa/analysis/libraries.py index af087b86..12f6a4d3 100644 --- a/capa/analysis/libraries.py +++ b/capa/analysis/libraries.py @@ -1,193 +1,8 @@ -""" -further requirements: - - nltk -""" +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. -import sys -import logging -import collections -from pathlib import Path - -import rich -from rich.text import Text - -import capa.analysis.strings -import capa.features.extractors.strings -from capa.analysis.strings import LibraryStringDatabase - -logger = logging.getLogger(__name__) - - -def extract_strings(buf, n=4): - yield from capa.features.extractors.strings.extract_ascii_strings(buf, n=n) - yield from capa.features.extractors.strings.extract_unicode_strings(buf, n=n) - - -def prune_databases(dbs: list[LibraryStringDatabase], n=8): - """remove less trustyworthy database entries. - - such as: - - those found in multiple databases - - those that are English words - - those that are too short - - Windows API and DLL names - """ - - # TODO: consider applying these filters directly to the persisted databases, not at load time. - - winapi = capa.analysis.strings.WindowsApiStringDatabase.from_defaults() - - try: - from nltk.corpus import words as nltk_words - except ImportError: - # one-time download of dataset. - # this probably doesn't work well for embedded use. - import nltk - nltk.download("words") - from nltk.corpus import words as nltk_words - words = set(nltk_words.words()) - - counter = collections.Counter() - to_remove = set() - for db in dbs: - for string in db.metadata_by_string.keys(): - counter[string] += 1 - - if string in words: - to_remove.add(string) - continue - - if len(string) < n: - to_remove.add(string) - continue - - if string in winapi.api_names: - to_remove.add(string) - continue - - if string in winapi.dll_names: - to_remove.add(string) - continue - - for string, count in counter.most_common(): - if count <= 1: - break - - # remove strings that are seen in more than one database - to_remove.add(string) - - for db in dbs: - for string in to_remove: - if string in db.metadata_by_string: - del db.metadata_by_string[string] - - -def open_ida(input_path: Path): - import tempfile - - import idapro - - t = Path(tempfile.mkdtemp(prefix="ida-")) / input_path.name - t.write_bytes(input_path.read_bytes()) - # resource leak: we should delete this upon exit - - idapro.enable_console_messages(False) - idapro.open_database(str(t.absolute()), run_auto_analysis=True) - - import ida_auto - ida_auto.auto_wait() - - - -def main(): - logging.basicConfig(level=logging.DEBUG) - - # use n=8 to ignore common words - N = 8 - - input_path = Path(sys.argv[1]) - input_buf = input_path.read_bytes() - - dbs = capa.analysis.strings.get_default_databases() - prune_databases(dbs, n=N) - - strings_by_library = collections.defaultdict(set) - for string in extract_strings(input_path.read_bytes(), n=N): - for db in dbs: - if (metadata := db.metadata_by_string.get(string.s)): - strings_by_library[metadata.library_name].add(string.s) - - console = rich.get_console() - console.print(f"found libraries:", style="bold") - for library, strings in sorted(strings_by_library.items(), key=lambda p: len(p[1]), reverse=True): - console.print(f" - [b]{library}[/] ({len(strings)} strings)") - - for string in sorted(strings)[:10]: - console.print(f" - {string}", markup=False, style="grey37") - - if len(strings) > 10: - console.print(" ...", style="grey37") - - if not strings_by_library: - console.print(" (none)", style="grey37") - # since we're not going to find any strings - # return early and don't do IDA analysis - return - - # TODO: ensure there are XXX matches for each library, or ignore those entries - - open_ida(input_path) - - import idaapi - import idautils - import ida_funcs - import capa.features.extractors.ida.helpers as ida_helpers - - strings_by_function = collections.defaultdict(set) - for ea in idautils.Functions(): - f = idaapi.get_func(ea) - - # ignore library functions and thunk functions as identified by IDA - if f.flags & idaapi.FUNC_THUNK: - continue - if f.flags & idaapi.FUNC_LIB: - continue - - for bb in ida_helpers.get_function_blocks(f): - for insn in ida_helpers.get_instructions_in_range(bb.start_ea, bb.end_ea): - ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn) - if ref == insn.ea: - continue - - string = capa.features.extractors.ida.helpers.find_string_at(ref) - if not string: - continue - - for db in dbs: - if (metadata := db.metadata_by_string.get(string)): - strings_by_function[ea].add(string) - - # ensure there are at least XXX functions renamed, or ignore those entries - - console.print("functions:", style="bold") - for function, strings in sorted(strings_by_function.items()): - if strings: - name = ida_funcs.get_func_name(function) - - console.print(f" [b]{name}[/]@{function:08x}:") - - for string in strings: - for db in dbs: - if (metadata := db.metadata_by_string.get(string)): - location = Text(f"{metadata.library_name}@{metadata.library_version}::{metadata.function_name}", style="grey37") - console.print(" - ", location, ": ", string.rstrip()) - - # TODO: ensure there aren't conflicts among the matches - - console.print() - - console.print(f"found {len(strings_by_function)} library functions across {len(list(idautils.Functions()))} functions") - - -if __name__ == "__main__": - main() diff --git a/capa/analysis/strings/__init__.py b/capa/analysis/strings/__init__.py index 7721cd08..6d689b3c 100644 --- a/capa/analysis/strings/__init__.py +++ b/capa/analysis/strings/__init__.py @@ -1,10 +1,28 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +""" +further requirements: + - nltk +""" import gzip -import pathlib -from typing import Dict, Sequence +import logging +import collections +from typing import Dict +from pathlib import Path from dataclasses import dataclass import msgspec +import capa.features.extractors.strings + +logger = logging.getLogger(__name__) + class LibraryString(msgspec.Struct): string: str @@ -23,7 +41,7 @@ class LibraryStringDatabase: return len(self.metadata_by_string) @classmethod - def from_file(cls, path: pathlib.Path) -> "LibraryStringDatabase": + def from_file(cls, path: Path) -> "LibraryStringDatabase": metadata_by_string: Dict[str, LibraryString] = {} decoder = msgspec.json.Decoder(type=LibraryString) for line in gzip.decompress(path.read_bytes()).split(b"\n"): @@ -55,12 +73,12 @@ DEFAULT_FILENAMES = ( "zlib.jsonl.gz", ) -DEFAULT_PATHS = tuple( - pathlib.Path(__file__).parent / "data" / "oss" / filename for filename in DEFAULT_FILENAMES -) + (pathlib.Path(__file__).parent / "data" / "crt" / "msvc_v143.jsonl.gz",) +DEFAULT_PATHS = tuple(Path(__file__).parent / "data" / "oss" / filename for filename in DEFAULT_FILENAMES) + ( + Path(__file__).parent / "data" / "crt" / "msvc_v143.jsonl.gz", +) -def get_default_databases() -> Sequence[LibraryStringDatabase]: +def get_default_databases() -> list[LibraryStringDatabase]: return [LibraryStringDatabase.from_file(path) for path in DEFAULT_PATHS] @@ -73,9 +91,9 @@ class WindowsApiStringDatabase: return len(self.dll_names) + len(self.api_names) @classmethod - def from_dir(cls, path: pathlib.Path) -> "WindowsApiStringDatabase": - dll_names: Set[str] = set() - api_names: Set[str] = set() + def from_dir(cls, path: Path) -> "WindowsApiStringDatabase": + dll_names: set[str] = set() + api_names: set[str] = set() for line in gzip.decompress((path / "dlls.txt.gz").read_bytes()).decode("utf-8").splitlines(): if not line: @@ -91,5 +109,69 @@ class WindowsApiStringDatabase: @classmethod def from_defaults(cls) -> "WindowsApiStringDatabase": - return cls.from_dir(pathlib.Path(__file__).parent / "data" / "winapi") + return cls.from_dir(Path(__file__).parent / "data" / "winapi") + +def extract_strings(buf, n=4): + yield from capa.features.extractors.strings.extract_ascii_strings(buf, n=n) + yield from capa.features.extractors.strings.extract_unicode_strings(buf, n=n) + + +def prune_databases(dbs: list[LibraryStringDatabase], n=8): + """remove less trustyworthy database entries. + + such as: + - those found in multiple databases + - those that are English words + - those that are too short + - Windows API and DLL names + """ + + # TODO: consider applying these filters directly to the persisted databases, not at load time. + + winapi = WindowsApiStringDatabase.from_defaults() + + try: + from nltk.corpus import words as nltk_words + except ImportError: + # one-time download of dataset. + # this probably doesn't work well for embedded use. + import nltk + + nltk.download("words") + from nltk.corpus import words as nltk_words + words = set(nltk_words.words()) + + counter: collections.Counter[str] = collections.Counter() + to_remove = set() + for db in dbs: + for string in db.metadata_by_string.keys(): + counter[string] += 1 + + if string in words: + to_remove.add(string) + continue + + if len(string) < n: + to_remove.add(string) + continue + + if string in winapi.api_names: + to_remove.add(string) + continue + + if string in winapi.dll_names: + to_remove.add(string) + continue + + for string, count in counter.most_common(): + if count <= 1: + break + + # remove strings that are seen in more than one database + to_remove.add(string) + + for db in dbs: + for string in to_remove: + if string in db.metadata_by_string: + del db.metadata_by_string[string] diff --git a/capa/analysis/strings/__main__.py b/capa/analysis/strings/__main__.py new file mode 100644 index 00000000..d0f32134 --- /dev/null +++ b/capa/analysis/strings/__main__.py @@ -0,0 +1,135 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import sys +import logging +import collections +from pathlib import Path + +import rich +from rich.text import Text + +import capa.analysis.strings +import capa.features.extractors.strings + +logger = logging.getLogger(__name__) + + +def open_ida(input_path: Path): + import tempfile + + import idapro + + t = Path(tempfile.mkdtemp(prefix="ida-")) / input_path.name + t.write_bytes(input_path.read_bytes()) + # resource leak: we should delete this upon exit + + idapro.enable_console_messages(False) + idapro.open_database(str(t.absolute()), run_auto_analysis=True) + + import ida_auto + + ida_auto.auto_wait() + + +def main(): + logging.basicConfig(level=logging.DEBUG) + + # use n=8 to ignore common words + N = 8 + + input_path = Path(sys.argv[1]) + + dbs = capa.analysis.strings.get_default_databases() + capa.analysis.strings.prune_databases(dbs, n=N) + + strings_by_library = collections.defaultdict(set) + for string in capa.analysis.strings.extract_strings(input_path.read_bytes(), n=N): + for db in dbs: + if metadata := db.metadata_by_string.get(string.s): + strings_by_library[metadata.library_name].add(string.s) + + console = rich.get_console() + console.print("found libraries:", style="bold") + for library, strings in sorted(strings_by_library.items(), key=lambda p: len(p[1]), reverse=True): + console.print(f" - [b]{library}[/] ({len(strings)} strings)") + + for string in sorted(strings)[:10]: + console.print(f" - {string}", markup=False, style="grey37") + + if len(strings) > 10: + console.print(" ...", style="grey37") + + if not strings_by_library: + console.print(" (none)", style="grey37") + # since we're not going to find any strings + # return early and don't do IDA analysis + return + + # TODO: ensure there are XXX matches for each library, or ignore those entries + + open_ida(input_path) + + import idaapi + import idautils + import ida_funcs + + import capa.features.extractors.ida.helpers as ida_helpers + + strings_by_function = collections.defaultdict(set) + for ea in idautils.Functions(): + f = idaapi.get_func(ea) + + # ignore library functions and thunk functions as identified by IDA + if f.flags & idaapi.FUNC_THUNK: + continue + if f.flags & idaapi.FUNC_LIB: + continue + + for bb in ida_helpers.get_function_blocks(f): + for insn in ida_helpers.get_instructions_in_range(bb.start_ea, bb.end_ea): + ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn) + if ref == insn.ea: + continue + + string = capa.features.extractors.ida.helpers.find_string_at(ref) + if not string: + continue + + for db in dbs: + if metadata := db.metadata_by_string.get(string): + strings_by_function[ea].add(string) + + # ensure there are at least XXX functions renamed, or ignore those entries + + console.print("functions:", style="bold") + for function, strings in sorted(strings_by_function.items()): + if strings: + name = ida_funcs.get_func_name(function) + + console.print(f" [b]{name}[/]@{function:08x}:") + + for string in strings: + for db in dbs: + if metadata := db.metadata_by_string.get(string): + location = Text( + f"{metadata.library_name}@{metadata.library_version}::{metadata.function_name}", + style="grey37", + ) + console.print(" - ", location, ": ", string.rstrip()) + + # TODO: ensure there aren't conflicts among the matches + + console.print() + + console.print( + f"found {len(strings_by_function)} library functions across {len(list(idautils.Functions()))} functions" + ) + + +if __name__ == "__main__": + main() diff --git a/capa/analysis/strings/data/oss/jh_to_qs.py b/capa/analysis/strings/data/oss/jh_to_qs.py deleted file mode 100644 index 8c45a432..00000000 --- a/capa/analysis/strings/data/oss/jh_to_qs.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -convert from a jh CSV file to a .jsonl.gz OpenSourceString database. - -the jh file looks like: - - # triplet,compiler,library,version,profile,path,function,type,value - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100 - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffff8 - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffffe - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,BZ2_bzCompressInit - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,handle_compress - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0x0000fa90 - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff8 - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff9 - x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffffd - -jh is found here: https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs -""" -import sys -import json -import pathlib - -import msgspec - -from capa.analysis.strings import LibraryString - -p = pathlib.Path(sys.argv[1]) -for line in p.read_text().split("\n"): - if not line: - continue - - if line.startswith("#"): - continue - - triplet, compiler, library, version, profile, path, function, rest = line.split(",", 7) - type, _, value = rest.partition(",") - if type != "string": - continue - - if value.startswith('"'): - value = json.loads(value) - - s = LibraryString( - string=value, - library_name=library, - library_version=version, - file_path=path, - function_name=function, - ) - - sys.stdout.buffer.write(msgspec.json.encode(s)) - sys.stdout.buffer.write(b"\n")