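"""
Build a GEXF graph describing, for one binary in an Assemblage dataset, which
functions reference which other functions (call edges) and data addresses
(data edges), plus "neighbor" edges between adjacent items within each PE
section. Disassembly is provided by lancelot via the BinExport2 format.

Usage sketch (the script name, database path, samples directory, and binary
id below are placeholders):

    python inspect_assemblage.py assemblage.sqlite samples/ 1234 > graph.gexf
"""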
import sys
import logging
import sqlite3
import argparse
from typing import Iterator, Optional
from pathlib import Path
from dataclasses import dataclass

import rich
import rich.table
import pefile
import lancelot
import lancelot.be2utils
import networkx as nx
from lancelot.be2utils import BinExport2Index, ReadMemoryError, AddressSpace
from lancelot.be2utils.binexport2_pb2 import BinExport2

import capa.main
import capa.features.extractors.strings

logger = logging.getLogger(__name__)


def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool:
    return vertex.HasField("type") and vertex.type == type_


def is_vertex_thunk(vertex: BinExport2.CallGraph.Vertex) -> bool:
    return is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.THUNK)


# maximum number of thunk-to-thunk hops to follow before giving up on a chain.
THUNK_CHAIN_DEPTH_DELTA = 5


def compute_thunks(be2: BinExport2, idx: BinExport2Index) -> dict[int, int]:
    # from thunk address to target function address
    thunks: dict[int, int] = {}

    for addr, vertex_idx in idx.vertex_index_by_address.items():
        vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx]
        if not is_vertex_thunk(vertex):
            continue

        curr_vertex_idx: int = vertex_idx
        for _ in range(THUNK_CHAIN_DEPTH_DELTA):
            thunk_callees: list[int] = idx.callees_by_vertex_index[curr_vertex_idx]
            # if this doesn't hold, then this doesn't seem to be a thunk,
            # because len(thunk_callees) is either:
            #   0, and the thunk doesn't point to anything, such as `jmp eax`, or
            #   >1, and the thunk may end up at many functions.

            if not thunk_callees:
                # maybe we have an indirect jump, like `jmp eax`,
                # that we can't actually resolve here.
                break

            assert len(thunk_callees) == 1, f"thunk @ {hex(addr)} has unexpected callees"

            thunked_vertex_idx: int = thunk_callees[0]
            thunked_vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[thunked_vertex_idx]

            if not is_vertex_thunk(thunked_vertex):
                assert thunked_vertex.HasField("address")

                thunks[addr] = thunked_vertex.address
                break

            curr_vertex_idx = thunked_vertex_idx

    return thunks
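

# For example (hypothetical addresses): if 0x401000 holds `jmp 0x402000` and
# 0x402000 is a real (non-thunk) function, compute_thunks() yields
# {0x401000: 0x402000}. Chains of thunks are followed for up to
# THUNK_CHAIN_DEPTH_DELTA hops before giving up.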
def read_string(address_space: AddressSpace, address: int) -> Optional[str]:
    try:
        # if at the end of a segment, a full 0x100 bytes may not be available.
        buf: bytes = address_space.read_memory(address, 0x100)
    except ReadMemoryError:
        logger.debug("failed to read memory: 0x%x", address)
        return None

    # note: we *always* return or break within the first iteration
    for s in capa.features.extractors.strings.extract_ascii_strings(buf):
        if s.offset != 0:
            break
        return s.s

    # note: we *always* return or break within the first iteration
    for s in capa.features.extractors.strings.extract_unicode_strings(buf):
        if s.offset != 0:
            break
        return s.s

    return None
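

# For example, if the bytes at `address` begin with b"hello\x00...",
# read_string() returns "hello"; if `address` falls mid-string (the extracted
# string starts at a nonzero offset), it returns None.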


@dataclass
class AssemblageRow:
    # from table: binaries
    binary_id: int
    file_name: str
    platform: str
    build_mode: str
    toolset_version: str
    github_url: str
    optimization: str
    repo_last_update: int
    size: int
    path: str
    license: str
    binary_hash: str
    repo_commit_hash: str
    # from table: functions
    function_id: int
    function_name: str
    function_hash: str
    top_comments: str
    source_codes: str
    prototype: str
    _source_file: str
    # from table: rvas
    rva_id: int
    start_rva: int
    end_rva: int

    @property
    def source_file(self):
        # clean up some extra metadata provided by Assemblage
        return self._source_file.partition(" (MD5: ")[0].partition(" (0x3: ")[0]
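
    # note: rows are constructed positionally (AssemblageRow(*row)), so the
    # field order above must match the column order of the `assemblage` view.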


class Assemblage:
    conn: sqlite3.Connection
    samples: Path

    def __init__(self, db: Path, samples: Path):
        super().__init__()

        self.db = db
        self.samples = samples

        self.conn = sqlite3.connect(self.db)
        with self.conn:
            self.conn.executescript("""
                PRAGMA journal_mode = WAL;
                PRAGMA synchronous = NORMAL;
                PRAGMA busy_timeout = 5000;
                PRAGMA cache_size = -20000; -- 20MB
                PRAGMA foreign_keys = true;
                PRAGMA temp_store = memory;

                BEGIN IMMEDIATE TRANSACTION;
                CREATE INDEX IF NOT EXISTS idx__functions__binary_id ON functions (binary_id);
                CREATE INDEX IF NOT EXISTS idx__rvas__function_id ON rvas (function_id);

                CREATE VIEW IF NOT EXISTS assemblage AS
                    SELECT
                        binaries.id AS binary_id,
                        binaries.file_name AS file_name,
                        binaries.platform AS platform,
                        binaries.build_mode AS build_mode,
                        binaries.toolset_version AS toolset_version,
                        binaries.github_url AS github_url,
                        binaries.optimization AS optimization,
                        binaries.repo_last_update AS repo_last_update,
                        binaries.size AS size,
                        binaries.path AS path,
                        binaries.license AS license,
                        binaries.hash AS hash,
                        binaries.repo_commit_hash AS repo_commit_hash,

                        functions.id AS function_id,
                        functions.name AS function_name,
                        functions.hash AS function_hash,
                        functions.top_comments AS top_comments,
                        functions.source_codes AS source_codes,
                        functions.prototype AS prototype,
                        functions.source_file AS source_file,

                        rvas.id AS rva_id,
                        rvas.start AS start_rva,
                        rvas.end AS end_rva
                    FROM binaries
                    JOIN functions ON binaries.id = functions.binary_id
                    JOIN rvas ON functions.id = rvas.function_id;
            """)

    def get_row_by_binary_id(self, binary_id: int) -> AssemblageRow:
        with self.conn:
            cur = self.conn.execute("SELECT * FROM assemblage WHERE binary_id = ? LIMIT 1;", (binary_id,))
            return AssemblageRow(*cur.fetchone())

    def get_rows_by_binary_id(self, binary_id: int) -> Iterator[AssemblageRow]:
        with self.conn:
            cur = self.conn.execute("SELECT * FROM assemblage WHERE binary_id = ?;", (binary_id,))
            row = cur.fetchone()
            while row:
                yield AssemblageRow(*row)
                row = cur.fetchone()

    def get_path_by_binary_id(self, binary_id: int) -> Path:
        with self.conn:
            cur = self.conn.execute("SELECT path FROM assemblage WHERE binary_id = ? LIMIT 1;", (binary_id,))
            return self.samples / cur.fetchone()[0]

    def get_pe_by_binary_id(self, binary_id: int) -> pefile.PE:
        path = self.get_path_by_binary_id(binary_id)
        return pefile.PE(data=path.read_bytes(), fast_load=True)
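

# Usage sketch for Assemblage (paths and binary id are placeholders):
#
#   db = Assemblage(Path("assemblage.sqlite"), Path("samples/"))
#   for row in db.get_rows_by_binary_id(1):
#       print(row.function_name, hex(row.start_rva), hex(row.end_rva))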


def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description="Inspect object boundaries in compiled programs")
    capa.main.install_common_args(parser, wanted={})
    parser.add_argument("assemblage_database", type=Path, help="path to Assemblage database")
    parser.add_argument("assemblage_directory", type=Path, help="path to Assemblage samples directory")
    parser.add_argument("binary_id", type=int, help="primary key of binary to inspect")
    args = parser.parse_args(args=argv)

    try:
        capa.main.handle_common_args(args)
    except capa.main.ShouldExitError as e:
        return e.status_code

    # quiet the noisy "goblin.pe" logger.
    logging.getLogger("goblin.pe").setLevel(logging.WARNING)

    if not args.assemblage_database.is_file():
        raise ValueError("database doesn't exist")

    db = Assemblage(args.assemblage_database, args.assemblage_directory)

    @dataclass
    class Function:
        file: str
        name: str
        start_rva: int
        end_rva: int

    functions = [
        Function(
            file=m.source_file,
            name=m.function_name,
            start_rva=m.start_rva,
            end_rva=m.end_rva,
        )
        for m in db.get_rows_by_binary_id(args.binary_id)
    ]

    pe = db.get_pe_by_binary_id(args.binary_id)
    base_address: int = pe.OPTIONAL_HEADER.ImageBase

    # Assemblage RVAs are image-relative, so rebase the ground-truth function
    # starts onto the preferred image base before passing them as hints.
    pe_path = db.get_path_by_binary_id(args.binary_id)
    be2: BinExport2 = lancelot.get_binexport2_from_bytes(
        pe_path.read_bytes(),
        function_hints=[base_address + function.start_rva for function in functions],
    )

    idx = lancelot.be2utils.BinExport2Index(be2)
    address_space = lancelot.be2utils.AddressSpace.from_pe(pe, base_address)
    thunks = compute_thunks(be2, idx)

    @dataclass(frozen=True, order=True)
    class Node:
        address: int
        type: str

    @dataclass(frozen=True, order=True)
    class Edge:
        source: int
        destination: int
        type: str

    nodes: set[Node] = set()
    edges: set[Edge] = set()
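
    # the graph has two node types ("function" and "data") and three edge
    # types: "call" (function -> callee), "data" (function -> referenced
    # data), and "neighbor" (adjacent nodes within the same PE section).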

    for flow_graph in be2.flow_graph:
        datas: set[int] = set()
        callees: set[int] = set()

        entry_basic_block_index: int = flow_graph.entry_basic_block_index
        flow_graph_address: int = idx.get_basic_block_address(entry_basic_block_index)

        for basic_block_index in flow_graph.basic_block_index:
            basic_block: BinExport2.BasicBlock = be2.basic_block[basic_block_index]

            for instruction_index, instruction, instruction_address in idx.basic_block_instructions(basic_block):
                for addr in instruction.call_target:
                    addr = thunks.get(addr, addr)

                    if addr not in idx.vertex_index_by_address:
                        # disassembler did not define a function at this address
                        logger.debug("0x%x is not a vertex", addr)
                        continue

                    vertex_idx: int = idx.vertex_index_by_address[addr]
                    vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx]

                    callees.add(vertex.address)

                for data_reference_index in idx.data_reference_index_by_source_instruction_index.get(instruction_index, []):
                    data_reference: BinExport2.DataReference = be2.data_reference[data_reference_index]
                    data_reference_address: int = data_reference.address

                    if data_reference_address in idx.insn_address_by_index:
                        # appears to be code
                        continue

                    datas.add(data_reference_address)

        vertex_index = idx.vertex_index_by_address[flow_graph_address]
        name = idx.get_function_name_by_vertex(vertex_index)

        nodes.add(Node(
            address=flow_graph_address,
            type="function",
        ))
        if datas or callees:
            logger.info("%s @ 0x%X:", name, flow_graph_address)

            for data in sorted(datas):
                logger.info(" - 0x%X", data)
                nodes.add(Node(
                    address=data,
                    type="data",
                ))
                edges.add(Edge(
                    source=flow_graph_address,
                    destination=data,
                    type="data",
                ))

            for callee in sorted(callees):
                logger.info(" - %s", idx.get_function_name_by_address(callee))
                nodes.add(Node(
                    address=callee,
                    type="function",
                ))
                edges.add(Edge(
                    source=flow_graph_address,
                    destination=callee,
                    type="call",
                ))

        else:
            logger.info("%s @ 0x%X: (none)", name, flow_graph_address)

    for section in pe.sections:
        # within each section, emit a neighbor edge for each pair of adjacent
        # nodes; sort by address so that neighbors reflect layout order.
        section_nodes = sorted(
            node
            for node in nodes
            if section.VirtualAddress + base_address <= node.address < base_address + section.VirtualAddress + section.Misc_VirtualSize
        )

        for i in range(1, len(section_nodes)):
            a = section_nodes[i - 1]
            b = section_nodes[i]

            edges.add(Edge(
                type="neighbor",
                source=a.address,
                destination=b.address,
            ))

    g = nx.MultiDiGraph()

    for node in sorted(nodes):
        g.add_node(node.address, type=node.type)

    for edge in sorted(edges):
        g.add_edge(edge.source, edge.destination, key=edge.type)

    for line in nx.generate_gexf(g):
        print(line)
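
    # the GEXF document on stdout can be redirected to a file and loaded
    # elsewhere, e.g. with networkx.read_gexf("graph.gexf") (filename is a
    # placeholder), or opened in a graph tool such as Gephi.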

    # db.conn.close()


if __name__ == "__main__":
    sys.exit(main())