Compare commits

...

6 Commits

Author SHA1 Message Date
Willi Ballenthin
99daa6370d binary ninja: fix computation of call graph 2024-11-27 12:28:02 +00:00
Willi Ballenthin
61e1684783 binary ninja: use function.callers to compute call graph 2024-11-27 12:18:38 +00:00
Willi Ballenthin
73f56f585c binja: provide llil to instruction handlers via ctx 2024-11-27 11:57:12 +00:00
Willi Ballenthin
319dbfe299 binja: compute call graph up front
for cache friendliness. see https://github.com/mandiant/capa/issues/2402
2024-11-27 11:11:20 +00:00
Willi Ballenthin
a909d022a6 loader: show duration of binary ninja loading 2024-11-27 11:10:11 +00:00
Willi Ballenthin
999f91baa1 perf: add timing ctxmgr for logging duration of a block 2024-11-27 10:59:24 +00:00
6 changed files with 135 additions and 110 deletions

View File

@@ -5,15 +5,19 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import Iterator
from collections import defaultdict
import binaryninja as binja
from binaryninja import ILException
from binaryninja import Function, BinaryView, SymbolType, ILException, RegisterValueType, LowLevelILOperation
import capa.perf
import capa.features.extractors.elf
import capa.features.extractors.binja.file
import capa.features.extractors.binja.insn
import capa.features.extractors.binja.global_
import capa.features.extractors.binja.helpers
import capa.features.extractors.binja.function
import capa.features.extractors.binja.basicblock
from capa.features.common import Feature
@@ -26,6 +30,8 @@ from capa.features.extractors.base_extractor import (
StaticFeatureExtractor,
)
logger = logging.getLogger(__name__)
class BinjaFeatureExtractor(StaticFeatureExtractor):
def __init__(self, bv: binja.BinaryView):
@@ -36,6 +42,9 @@ class BinjaFeatureExtractor(StaticFeatureExtractor):
self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv))
self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv))
with capa.perf.timing("binary ninja: computing call graph"):
self._call_graph = self._build_call_graph()
def get_base_address(self):
return AbsoluteVirtualAddress(self.bv.start)
@@ -45,9 +54,27 @@ class BinjaFeatureExtractor(StaticFeatureExtractor):
def extract_file_features(self):
yield from capa.features.extractors.binja.file.extract_features(self.bv)
def _build_call_graph(self):
# from function address to function addresses
calls_from: defaultdict[int, set[int]] = defaultdict(set)
calls_to: defaultdict[int, set[int]] = defaultdict(set)
f: Function
for f in self.bv.functions:
for caller in f.callers:
calls_from[caller.start].add(f.start)
calls_to[f.start].add(caller.start)
call_graph = {
"calls_to": calls_to,
"calls_from": calls_from,
}
return call_graph
def get_functions(self) -> Iterator[FunctionHandle]:
for f in self.bv.functions:
yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f)
yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f, ctx={"call_graph": self._call_graph})
def extract_function_features(self, fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
yield from capa.features.extractors.binja.function.extract_features(fh)
@@ -76,13 +103,16 @@ class BinjaFeatureExtractor(StaticFeatureExtractor):
yield from capa.features.extractors.binja.basicblock.extract_features(fh, bbh)
def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]:
import capa.features.extractors.binja.helpers as binja_helpers
f: binja.Function = fh.inner
bb: tuple[binja.BasicBlock, binja.MediumLevelILBasicBlock] = bbh.inner
addr = bb[0].start
bb: binja.BasicBlock
mlbb: binja.MediumLevelILBasicBlock
bb, mlbb = bbh.inner
for text, length in bb[0]:
insn = binja_helpers.DisassemblyInstruction(addr, length, text)
addr: int = bb.start
for text, length in bb:
llil = f.get_llils_at(addr)
insn = capa.features.extractors.binja.helpers.DisassemblyInstruction(addr, length, text, llil)
yield InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn)
addr += length

View File

@@ -7,7 +7,7 @@
# See the License for the specific language governing permissions and limitations under the License.
from typing import Iterator
from binaryninja import Function, BinaryView, SymbolType, ILException, RegisterValueType, LowLevelILOperation
from binaryninja import Function, BinaryView, SymbolType
from capa.features.file import FunctionName
from capa.features.common import Feature, Characteristic
@@ -20,38 +20,24 @@ def extract_function_calls_to(fh: FunctionHandle):
"""extract callers to a function"""
func: Function = fh.inner
for caller in func.caller_sites:
# Everything that is a code reference to the current function is considered a caller, which actually includes
# many other references that are NOT a caller. For example, an instruction `push function_start` will also be
# considered a caller to the function
llil = None
try:
# Temporary fix for https://github.com/Vector35/binaryninja-api/issues/6020. Since `.llil` can throw an
# exception rather than returning None
llil = caller.llil
except ILException:
caller: int
for caller in fh.ctx["call_graph"]["calls_to"].get(func.start, []):
if caller == func.start:
continue
if (llil is None) or llil.operation not in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_JUMP,
LowLevelILOperation.LLIL_TAILCALL,
]:
yield Characteristic("calls to"), AbsoluteVirtualAddress(caller)
def extract_function_calls_from(fh: FunctionHandle):
"""extract callers from a function"""
func: Function = fh.inner
callee: int
for callee in fh.ctx["call_graph"]["calls_from"].get(func.start, []):
if callee == func.start:
continue
if llil.dest.value.type not in [
RegisterValueType.ImportedAddressValue,
RegisterValueType.ConstantValue,
RegisterValueType.ConstantPointerValue,
]:
continue
address = llil.dest.value.value
if address != func.start:
continue
yield Characteristic("calls to"), AbsoluteVirtualAddress(caller.address)
yield Characteristic("calls from"), AbsoluteVirtualAddress(callee)
def extract_function_loop(fh: FunctionHandle):
@@ -72,13 +58,12 @@ def extract_function_loop(fh: FunctionHandle):
def extract_recursive_call(fh: FunctionHandle):
"""extract recursive function call"""
func: Function = fh.inner
bv: BinaryView = func.view
if bv is None:
return
for ref in bv.get_code_refs(func.start):
if ref.function == func:
caller: int
for caller in fh.ctx["call_graph"]["calls_to"].get(func.start, []):
if caller == func.start:
yield Characteristic("recursive call"), fh.address
return
def extract_function_name(fh: FunctionHandle):
@@ -108,4 +93,10 @@ def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
yield feature, addr
FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call, extract_function_name)
FUNCTION_HANDLERS = (
extract_function_calls_to,
extract_function_calls_from,
extract_function_loop,
extract_recursive_call,
extract_function_name,
)

View File

@@ -9,7 +9,7 @@ import re
from typing import Callable
from dataclasses import dataclass
from binaryninja import BinaryView, LowLevelILInstruction
from binaryninja import BinaryView, LowLevelILOperation, LowLevelILInstruction
from binaryninja.architecture import InstructionTextToken
@@ -18,6 +18,24 @@ class DisassemblyInstruction:
address: int
length: int
text: list[InstructionTextToken]
llil: list[LowLevelILInstruction]
@property
def is_call(self):
if not self.llil:
return False
# TODO(williballenthin): when to use one vs many llil instructions
# https://github.com/Vector35/binaryninja-api/issues/6205
llil = self.llil[0]
if not llil:
return False
return llil.operation in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_TAILCALL,
]
LLIL_VISITOR = Callable[[LowLevelILInstruction, LowLevelILInstruction, int], bool]

View File

@@ -23,7 +23,7 @@ from binaryninja import (
import capa.features.extractors.helpers
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.common import MAX_BYTES_FEATURE_SIZE, Bytes, String, Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.address import Address
from capa.features.extractors.binja.helpers import DisassemblyInstruction, visit_llil_exprs
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
@@ -32,6 +32,7 @@ from capa.features.extractors.base_extractor import BBHandle, InsnHandle, Functi
SECURITY_COOKIE_BYTES_DELTA = 0x40
# TODO: move this to call graph pass
# check if a function is a stub function to another function/symbol. The criteria is:
# 1. The function must only have one basic block
# 2. The function must only make one call/jump to another address
@@ -82,8 +83,9 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle)
"""
func: Function = fh.inner
bv: BinaryView = func.view
insn: DisassemblyInstruction = ih.inner
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
if llil.operation in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
@@ -138,10 +140,11 @@ def extract_insn_number_features(
example:
push 3136B0h ; dwControlCode
"""
func: Function = fh.inner
insn: DisassemblyInstruction = ih.inner
results: list[tuple[Any[Number, OperandNumber], Address]] = []
# TODO: try to move this out of line
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
if il.operation == LowLevelILOperation.LLIL_LOAD:
return False
@@ -165,7 +168,7 @@ def extract_insn_number_features(
return False
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
visit_llil_exprs(llil, llil_checker)
yield from results
@@ -179,11 +182,11 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl
"""
func: Function = fh.inner
bv: BinaryView = func.view
insn: DisassemblyInstruction = ih.inner
candidate_addrs = set()
llil = func.get_llil_at(ih.address)
if llil is None or llil.operation in [LowLevelILOperation.LLIL_CALL, LowLevelILOperation.LLIL_CALL_STACK_ADJUST]:
if insn.is_call:
return
for ref in bv.get_code_refs_from(ih.address):
@@ -205,7 +208,7 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl
return True
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
visit_llil_exprs(llil, llil_checker)
for addr in candidate_addrs:
@@ -227,6 +230,7 @@ def extract_insn_string_features(
"""
func: Function = fh.inner
bv: BinaryView = func.view
insn: DisassemblyInstruction = ih.inner
candidate_addrs = set()
@@ -250,7 +254,7 @@ def extract_insn_string_features(
return True
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
visit_llil_exprs(llil, llil_checker)
# Now we have all the candidate address, check them for string or pointer to string
@@ -283,6 +287,7 @@ def extract_insn_offset_features(
.text:0040112F cmp [esi+4], ebx
"""
func: Function = fh.inner
insn: DisassemblyInstruction = ih.inner
results: list[tuple[Any[Offset, OperandOffset], Address]] = []
address_size = func.view.arch.address_size * 8
@@ -327,7 +332,7 @@ def extract_insn_offset_features(
return True
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
visit_llil_exprs(llil, llil_checker)
yield from results
@@ -367,7 +372,7 @@ def extract_insn_nzxor_characteristic_features(
parse instruction non-zeroing XOR instruction
ignore expected non-zeroing XORs, e.g. security cookies
"""
func: Function = fh.inner
insn: DisassemblyInstruction = ih.inner
results = []
@@ -383,7 +388,7 @@ def extract_insn_nzxor_characteristic_features(
else:
return True
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
visit_llil_exprs(llil, llil_checker)
yield from results
@@ -415,7 +420,7 @@ def extract_insn_peb_access_characteristic_features(
fs:[0x30] on x86, gs:[0x60] on x64
"""
func: Function = fh.inner
insn: DisassemblyInstruction = ih.inner
results = []
@@ -445,7 +450,7 @@ def extract_insn_peb_access_characteristic_features(
results.append((Characteristic("peb access"), ih.address))
return False
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
visit_llil_exprs(llil, llil_checker)
yield from results
@@ -455,7 +460,7 @@ def extract_insn_segment_access_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[tuple[Feature, Address]]:
"""parse instruction fs or gs access"""
func: Function = fh.inner
insn: DisassemblyInstruction = ih.inner
results = []
@@ -472,7 +477,7 @@ def extract_insn_segment_access_features(
return True
for llil in func.get_llils_at(ih.address):
for llil in insn.llil:
visit_llil_exprs(llil, llil_checker)
yield from results
@@ -500,47 +505,6 @@ def extract_insn_cross_section_cflow(
yield Characteristic("cross section flow"), ih.address
def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[tuple[Feature, Address]]:
"""extract functions calls from features
most relevant at the function scope, however, its most efficient to extract at the instruction scope
"""
func: Function = fh.inner
bv: BinaryView = func.view
if bv is None:
return
for il in func.get_llils_at(ih.address):
if il.operation not in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_TAILCALL,
]:
continue
dest = il.dest
if dest.operation == LowLevelILOperation.LLIL_CONST_PTR:
value = dest.value.value
yield Characteristic("calls from"), AbsoluteVirtualAddress(value)
elif dest.operation == LowLevelILOperation.LLIL_CONST:
yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value)
elif dest.operation == LowLevelILOperation.LLIL_LOAD:
indirect_src = dest.src
if indirect_src.operation == LowLevelILOperation.LLIL_CONST_PTR:
value = indirect_src.value.value
yield Characteristic("calls from"), AbsoluteVirtualAddress(value)
elif indirect_src.operation == LowLevelILOperation.LLIL_CONST:
yield Characteristic("calls from"), AbsoluteVirtualAddress(indirect_src.value)
elif dest.operation == LowLevelILOperation.LLIL_REG:
if dest.value.type in [
RegisterValueType.ImportedAddressValue,
RegisterValueType.ConstantValue,
RegisterValueType.ConstantPointerValue,
]:
yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value.value)
def extract_function_indirect_call_characteristic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[tuple[Feature, Address]]:
@@ -550,14 +514,15 @@ def extract_function_indirect_call_characteristic_features(
most relevant at the function or basic block scope;
however, its most efficient to extract at the instruction scope
"""
func: Function = fh.inner
insn: DisassemblyInstruction = ih.inner
llil = func.get_llil_at(ih.address)
if llil is None or llil.operation not in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_TAILCALL,
]:
if not insn.is_call:
return
# TODO(williballenthin): when to use one vs many llil instructions
# https://github.com/Vector35/binaryninja-api/issues/6205
llil = insn.llil[0]
if not llil:
return
if llil.dest.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]:
@@ -590,6 +555,5 @@ INSTRUCTION_HANDLERS = (
extract_insn_peb_access_characteristic_features,
extract_insn_cross_section_cflow,
extract_insn_segment_access_features,
extract_function_calls_from,
extract_function_indirect_call_characteristic_features,
)

View File

@@ -239,6 +239,7 @@ def get_extractor(
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path)
elif backend == BACKEND_BINJA:
import capa.perf as perf
import capa.features.extractors.binja.find_binja_api as finder
if not finder.has_binaryninja():
@@ -262,9 +263,10 @@ def get_extractor(
raise UnsupportedOSError()
with console.status("analyzing program...", spinner="dots"):
bv: binaryninja.BinaryView = binaryninja.load(str(input_path))
if bv is None:
raise RuntimeError(f"Binary Ninja cannot open file {input_path}")
with perf.timing("binary ninja: loading program"):
bv: binaryninja.BinaryView = binaryninja.load(str(input_path))
if bv is None:
raise RuntimeError(f"Binary Ninja cannot open file {input_path}")
return capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv)

View File

@@ -5,6 +5,9 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import time
import inspect
import contextlib
import collections
# this structure is unstable and may change before the next major release.
@@ -14,3 +17,20 @@ counters: collections.Counter[str] = collections.Counter()
def reset():
global counters
counters = collections.Counter()
@contextlib.contextmanager
def timing(msg: str):
"""log the given message start/stop and time taken, using the caller's `logger` instance."""
# stack:
# 0: here
# 1: contextlib
# 2: caller
caller = inspect.stack()[2]
caller_logger = caller.frame.f_globals.get("logger")
caller_logger.debug("%s...", msg)
t0 = time.time()
yield
t1 = time.time()
caller_logger.debug("%s done in %0.1fs.", msg, t1 - t0)