Merge branch 'master' into wb-proto

This commit is contained in:
Willi Ballenthin
2023-03-24 11:51:45 +01:00
committed by GitHub
24 changed files with 1481 additions and 28 deletions

View File

@@ -61,6 +61,7 @@ a = Analysis(
"qt5",
"pyqtwebengine",
"pyasn1",
"binaryninja",
],
)

View File

@@ -90,3 +90,37 @@ jobs:
run: pip install -e .[dev]
- name: Run tests
run: pytest -v tests/
binja-tests:
name: Binary Ninja tests for ${{ matrix.python-version }} on ${{ matrix.os }}
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.11"]
steps:
- name: Checkout capa with submodules
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
with:
submodules: recursive
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
with:
python-version: ${{ matrix.python-version }}
- name: Install pyyaml
run: sudo apt-get install -y libyaml-dev
- name: Install capa
run: pip install -e .[dev]
- name: install Binary Ninja
env:
BN_SERIAL: ${{ secrets.BN_SERIAL }}
run: |
mkdir ./.github/binja
curl "https://raw.githubusercontent.com/Vector35/binaryninja-api/6812c97/scripts/download_headless.py" -o ./.github/binja/download_headless.py
python ./.github/binja/download_headless.py --serial $BN_SERIAL --output .github/binja/BinaryNinja-headless.zip
unzip .github/binja/BinaryNinja-headless.zip -d .github/binja/
python .github/binja/binaryninja/scripts/install_api.py --install-on-root --silent
- name: Run tests
env:
BN_LICENSE: ${{ secrets.BN_LICENSE }}
run: pytest -v tests/test_binja_features.py # explicitly refer to the binja tests for performance. other tests run above.

4
.gitignore vendored
View File

@@ -118,8 +118,12 @@ rule-linter-output.log
scripts/perf/*.txt
scripts/perf/*.svg
scripts/perf/*.zip
.direnv
.envrc
.DS_Store
*/.DS_Store
Pipfile
Pipfile.lock
/cache/
.github/binja/binaryninja

View File

@@ -5,6 +5,7 @@
### New Features
- add protobuf format for result documents #1219 @williballenthin @mr-tz
- extractor: add Binary Ninja feature extractor @xusheng6
- new cli flag `--os` to override auto-detected operating system for a sample @captainGeech42
### Breaking Changes

View File

@@ -0,0 +1,146 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import sys
import string
import struct
from typing import Tuple, Iterator
from binaryninja import Function
from binaryninja import BasicBlock as BinjaBasicBlock
from binaryninja import (
BinaryView,
VariableSourceType,
MediumLevelILSetVar,
MediumLevelILOperation,
MediumLevelILBasicBlock,
MediumLevelILInstruction,
)
from capa.features.common import Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.basicblock import BasicBlock
from capa.features.extractors.helpers import MIN_STACKSTRING_LEN
from capa.features.extractors.base_extractor import BBHandle, FunctionHandle
def get_printable_len(il: MediumLevelILSetVar) -> int:
"""Return string length if all operand bytes are ascii or utf16-le printable"""
width = il.dest.type.width
value = il.src.value.value
if width == 1:
chars = struct.pack("<B", value & 0xFF)
elif width == 2:
chars = struct.pack("<H", value & 0xFFFF)
elif width == 4:
chars = struct.pack("<I", value & 0xFFFFFFFF)
elif width == 8:
chars = struct.pack("<Q", value & 0xFFFFFFFFFFFFFFFF)
else:
return 0
def is_printable_ascii(chars_: bytes):
return all(c < 127 and chr(c) in string.printable for c in chars_)
def is_printable_utf16le(chars_: bytes):
if all(c == 0x00 for c in chars_[1::2]):
return is_printable_ascii(chars_[::2])
if is_printable_ascii(chars):
return width
if is_printable_utf16le(chars):
return width // 2
return 0
def is_mov_imm_to_stack(il: MediumLevelILInstruction) -> bool:
"""verify instruction moves immediate onto stack"""
if il.operation != MediumLevelILOperation.MLIL_SET_VAR:
return False
if il.src.operation != MediumLevelILOperation.MLIL_CONST:
return False
if not il.dest.source_type == VariableSourceType.StackVariableSourceType:
return False
return True
def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool:
"""check basic block for stackstring indicators
true if basic block contains enough moves of constant bytes to the stack
"""
count = 0
for il in bb:
if is_mov_imm_to_stack(il):
count += get_printable_len(il)
if count > MIN_STACKSTRING_LEN:
return True
return False
def extract_bb_stackstring(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
"""extract stackstring indicators from basic block"""
bb: Tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner
if bb[1] is not None and bb_contains_stackstring(fh.inner, bb[1]):
yield Characteristic("stack string"), bbh.address
def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
"""extract tight loop indicators from a basic block"""
bb: Tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner
for edge in bb[0].outgoing_edges:
if edge.target.start == bb[0].start:
yield Characteristic("tight loop"), bbh.address
def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
"""extract basic block features"""
for bb_handler in BASIC_BLOCK_HANDLERS:
for feature, addr in bb_handler(fh, bbh):
yield feature, addr
yield BasicBlock(), bbh.address
BASIC_BLOCK_HANDLERS = (
extract_bb_tight_loop,
extract_bb_stackstring,
)
def main():
if len(sys.argv) < 2:
return
from binaryninja import BinaryViewType
from capa.features.extractors.binja.extractor import BinjaFeatureExtractor
bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1])
if bv is None:
return
features = []
extractor = BinjaFeatureExtractor(bv)
for fh in extractor.get_functions():
for bbh in extractor.get_basic_blocks(fh):
features.extend(list(extract_features(fh, bbh)))
import pprint
pprint.pprint(features)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,77 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from typing import List, Tuple, Iterator
import binaryninja as binja
import capa.features.extractors.elf
import capa.features.extractors.binja.file
import capa.features.extractors.binja.insn
import capa.features.extractors.binja.global_
import capa.features.extractors.binja.function
import capa.features.extractors.binja.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
class BinjaFeatureExtractor(FeatureExtractor):
def __init__(self, bv: binja.BinaryView):
super().__init__()
self.bv = bv
self.global_features: List[Tuple[Feature, Address]] = []
self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv))
self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv))
self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv))
def get_base_address(self):
return AbsoluteVirtualAddress(self.bv.start)
def extract_global_features(self):
yield from self.global_features
def extract_file_features(self):
yield from capa.features.extractors.binja.file.extract_features(self.bv)
def get_functions(self) -> Iterator[FunctionHandle]:
for f in self.bv.functions:
yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f)
def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.binja.function.extract_features(fh)
def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]:
f: binja.Function = fh.inner
# Set up a MLIL basic block dict look up to associate the disassembly basic block with its MLIL basic block
mlil_lookup = {}
for mlil_bb in f.mlil.basic_blocks:
mlil_lookup[mlil_bb.source_block.start] = mlil_bb
for bb in f.basic_blocks:
mlil_bb = None
if bb.start in mlil_lookup:
mlil_bb = mlil_lookup[bb.start]
yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=(bb, mlil_bb))
def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.binja.basicblock.extract_features(fh, bbh)
def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]:
import capa.features.extractors.binja.helpers as binja_helpers
bb: Tuple[binja.BasicBlock, binja.MediumLevelILBasicBlock] = bbh.inner
addr = bb[0].start
for text, length in bb[0]:
insn = binja_helpers.DisassemblyInstruction(addr, length, text)
yield InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn)
addr += length
def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle):
yield from capa.features.extractors.binja.insn.extract_features(fh, bbh, ih)

View File

@@ -0,0 +1,188 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import sys
import struct
from typing import Tuple, Iterator
from binaryninja import Symbol, Segment, BinaryView, SymbolType, SymbolBinding
import capa.features.extractors.common
import capa.features.extractors.helpers
import capa.features.extractors.strings
from capa.features.file import Export, Import, Section, FunctionName
from capa.features.common import FORMAT_PE, FORMAT_ELF, Format, String, Feature, Characteristic
from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.extractors.binja.helpers import unmangle_c_name
def check_segment_for_pe(bv: BinaryView, seg: Segment) -> Iterator[Tuple[int, int]]:
"""check segment for embedded PE
adapted for binja from:
https://github.com/vivisect/vivisect/blob/7be4037b1cecc4551b397f840405a1fc606f9b53/PE/carve.py#L19
"""
mz_xor = [
(
capa.features.extractors.helpers.xor_static(b"MZ", i),
capa.features.extractors.helpers.xor_static(b"PE", i),
i,
)
for i in range(256)
]
todo = []
# If this is the first segment of the binary, skip the first bytes. Otherwise, there will always be a matched
# PE at the start of the binaryview.
start = seg.start
if bv.view_type == "PE" and start == bv.start:
start += 1
for mzx, pex, i in mz_xor:
for off, _ in bv.find_all_data(start, seg.end, mzx):
todo.append((off, mzx, pex, i))
while len(todo):
off, mzx, pex, i = todo.pop()
# The MZ header has one field we will check e_lfanew is at 0x3c
e_lfanew = off + 0x3C
if seg.end < (e_lfanew + 4):
continue
newoff = struct.unpack("<I", capa.features.extractors.helpers.xor_static(bv.read(e_lfanew, 4), i))[0]
peoff = off + newoff
if seg.end < (peoff + 2):
continue
if bv.read(peoff, 2) == pex:
yield off, i
def extract_file_embedded_pe(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
"""extract embedded PE features"""
for seg in bv.segments:
for ea, _ in check_segment_for_pe(bv, seg):
yield Characteristic("embedded pe"), FileOffsetAddress(ea)
def extract_file_export_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
"""extract function exports"""
for sym in bv.get_symbols_of_type(SymbolType.FunctionSymbol):
if sym.binding in [SymbolBinding.GlobalBinding, SymbolBinding.WeakBinding]:
name = sym.short_name
yield Export(name), AbsoluteVirtualAddress(sym.address)
unmangled_name = unmangle_c_name(name)
if name != unmangled_name:
yield Export(unmangled_name), AbsoluteVirtualAddress(sym.address)
def extract_file_import_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
"""extract function imports
1. imports by ordinal:
- modulename.#ordinal
2. imports by name, results in two features to support importname-only
matching:
- modulename.importname
- importname
"""
for sym in bv.get_symbols_of_type(SymbolType.ImportAddressSymbol):
lib_name = str(sym.namespace)
addr = AbsoluteVirtualAddress(sym.address)
for name in capa.features.extractors.helpers.generate_symbols(lib_name, sym.short_name):
yield Import(name), addr
ordinal = sym.ordinal
if ordinal != 0 and (lib_name != ""):
ordinal_name = "#%d" % (ordinal)
for name in capa.features.extractors.helpers.generate_symbols(lib_name, ordinal_name):
yield Import(name), addr
def extract_file_section_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
"""extract section names"""
for name, section in bv.sections.items():
yield Section(name), AbsoluteVirtualAddress(section.start)
def extract_file_strings(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
"""extract ASCII and UTF-16 LE strings"""
for s in bv.strings:
yield String(s.value), FileOffsetAddress(s.start)
def extract_file_function_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
"""
extract the names of statically-linked library functions.
"""
for sym_name in bv.symbols:
for sym in bv.symbols[sym_name]:
if sym.type == SymbolType.LibraryFunctionSymbol:
name = sym.short_name
yield FunctionName(name), sym.address
if name.startswith("_"):
# some linkers may prefix linked routines with a `_` to avoid name collisions.
# extract features for both the mangled and un-mangled representations.
# e.g. `_fwrite` -> `fwrite`
# see: https://stackoverflow.com/a/2628384/87207
yield FunctionName(name[1:]), sym.address
def extract_file_format(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
view_type = bv.view_type
if view_type in ["PE", "COFF"]:
yield Format(FORMAT_PE), NO_ADDRESS
elif view_type == "ELF":
yield Format(FORMAT_ELF), NO_ADDRESS
elif view_type == "Raw":
# no file type to return when processing a binary file, but we want to continue processing
return
else:
raise NotImplementedError("unexpected file format: %d" % view_type)
def extract_features(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
"""extract file features"""
for file_handler in FILE_HANDLERS:
for feature, addr in file_handler(bv):
yield feature, addr
FILE_HANDLERS = (
extract_file_export_names,
extract_file_import_names,
extract_file_strings,
extract_file_section_names,
extract_file_embedded_pe,
extract_file_function_names,
extract_file_format,
)
def main():
""" """
if len(sys.argv) < 2:
return
from binaryninja import BinaryViewType
bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1])
if bv is None:
return
import pprint
pprint.pprint(list(extract_features(bv)))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,34 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import subprocess
# When the script gets executed as a standalone executable (via PyInstaller), `import binaryninja` does not work because
# we have excluded the binaryninja module in `pyinstaller.spec`. The trick here is to call the system Python and try
# to find out the path of the binaryninja module that has been installed.
# Note, including the binaryninja module in the `pyintaller.spec` would not work, since the binaryninja module tries to
# find the binaryninja core e.g., `libbinaryninjacore.dylib`, using a relative path. And this does not work when the
# binaryninja module is extracted by the PyInstaller.
code = r"""
from pathlib import Path
import importlib
spec = importlib.util.find_spec('binaryninja')
if spec is not None:
if len(spec.submodule_search_locations) > 0:
path = Path(spec.submodule_search_locations[0])
# encode the path with utf8 then convert to hex, make sure it can be read and restored properly
print(str(path.parent).encode('utf8').hex())
"""
def find_binja_path() -> str:
raw_output = subprocess.check_output(["python", "-c", "%s" % code]).decode("ascii").strip()
return bytes.fromhex(raw_output).decode("utf8")
if __name__ == "__main__":
print(find_binja_path())

View File

@@ -0,0 +1,97 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import sys
from typing import Tuple, Iterator
from binaryninja import Function, BinaryView, LowLevelILOperation
from capa.features.common import Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors import loops
from capa.features.extractors.base_extractor import FunctionHandle
def extract_function_calls_to(fh: FunctionHandle):
"""extract callers to a function"""
func: Function = fh.inner
bv: BinaryView = func.view
for caller in func.caller_sites:
# Everything that is a code reference to the current function is considered a caller, which actually includes
# many other references that are NOT a caller. For example, an instruction `push function_start` will also be
# considered a caller to the function
if caller.llil.operation in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_JUMP,
LowLevelILOperation.LLIL_TAILCALL,
]:
yield Characteristic("calls to"), AbsoluteVirtualAddress(caller.address)
def extract_function_loop(fh: FunctionHandle):
"""extract loop indicators from a function"""
func: Function = fh.inner
edges = []
# construct control flow graph
for bb in func.basic_blocks:
for edge in bb.outgoing_edges:
edges.append((bb.start, edge.target.start))
if loops.has_loop(edges):
yield Characteristic("loop"), fh.address
def extract_recursive_call(fh: FunctionHandle):
"""extract recursive function call"""
func: Function = fh.inner
bv: BinaryView = func.view
if bv is None:
return
for ref in bv.get_code_refs(func.start):
if ref.function == func:
yield Characteristic("recursive call"), fh.address
def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
for func_handler in FUNCTION_HANDLERS:
for feature, addr in func_handler(fh):
yield feature, addr
FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call)
def main():
""" """
if len(sys.argv) < 2:
return
from binaryninja import BinaryViewType
from capa.features.extractors.binja.extractor import BinjaFeatureExtractor
bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1])
if bv is None:
return
features = []
extractor = BinjaFeatureExtractor(bv)
for fh in extractor.get_functions():
features.extend(list(extract_features(fh)))
import pprint
pprint.pprint(features)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,55 @@
import logging
import contextlib
from typing import Tuple, Iterator
from binaryninja import BinaryView
import capa.features.extractors.elf
from capa.features.common import OS, OS_MACOS, ARCH_I386, ARCH_AMD64, OS_WINDOWS, Arch, Feature
from capa.features.address import NO_ADDRESS, Address
logger = logging.getLogger(__name__)
def extract_os(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
name = bv.platform.name
if "-" in name:
name = name.split("-")[0]
if name == "windows":
yield OS(OS_WINDOWS), NO_ADDRESS
elif name == "macos":
yield OS(OS_MACOS), NO_ADDRESS
elif name in ["linux", "freebsd", "decree"]:
yield OS(name), NO_ADDRESS
else:
# we likely end up here:
# 1. handling shellcode, or
# 2. handling a new file format (e.g. macho)
#
# for (1) we can't do much - its shellcode and all bets are off.
# we could maybe accept a further CLI argument to specify the OS,
# but i think this would be rarely used.
# rules that rely on OS conditions will fail to match on shellcode.
#
# for (2), this logic will need to be updated as the format is implemented.
logger.debug("unsupported file format: %s, will not guess OS", name)
return
def extract_arch(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]:
arch = bv.arch.name
if arch == "x86_64":
yield Arch(ARCH_AMD64), NO_ADDRESS
elif arch == "x86":
yield Arch(ARCH_I386), NO_ADDRESS
else:
# we likely end up here:
# 1. handling a new architecture (e.g. aarch64)
#
# for (1), this logic will need to be updated as the format is implemented.
logger.debug("unsupported architecture: %s", arch)
return

View File

@@ -0,0 +1,50 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import re
from typing import List, Callable
from dataclasses import dataclass
from binaryninja import LowLevelILInstruction
from binaryninja.architecture import InstructionTextToken
@dataclass
class DisassemblyInstruction:
address: int
length: int
text: List[InstructionTextToken]
LLIL_VISITOR = Callable[[LowLevelILInstruction, LowLevelILInstruction, int], bool]
def visit_llil_exprs(il: LowLevelILInstruction, func: LLIL_VISITOR):
# BN does not really support operand index at the disassembly level, so use the LLIL operand index as a substitute.
# Note, this is NOT always guaranteed to be the same as disassembly operand.
for i, op in enumerate(il.operands):
if isinstance(op, LowLevelILInstruction) and func(op, il, i):
visit_llil_exprs(op, func)
def unmangle_c_name(name: str) -> str:
# https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170#FormatC
# Possible variations for BaseThreadInitThunk:
# @BaseThreadInitThunk@12
# _BaseThreadInitThunk
# _BaseThreadInitThunk@12
# It is also possible for a function to have a `Stub` appended to its name:
# _lstrlenWStub@4
# A small optimization to avoid running the regex too many times
# TODO: this still increases the unit test execution time from 170s to 200s, should be able to accelerate it
if name[0] in ["@", "_"]:
match = re.match(r"^[@|_](.*?)(Stub)?(@\d+)?$", name)
if match:
return match.group(1)
return name

View File

@@ -0,0 +1,630 @@
# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import sys
from typing import Any, Dict, List, Tuple, Iterator, Optional
from binaryninja import Function
from binaryninja import BasicBlock as BinjaBasicBlock
from binaryninja import (
BinaryView,
ILRegister,
SymbolType,
BinaryReader,
RegisterValueType,
LowLevelILOperation,
LowLevelILInstruction,
InstructionTextTokenType,
)
import capa.features.extractors.helpers
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.binja.helpers import DisassemblyInstruction, visit_llil_exprs
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
# security cookie checks may perform non-zeroing XORs, these are expected within a certain
# byte range within the first and returning basic blocks, this helps to reduce FP features
SECURITY_COOKIE_BYTES_DELTA = 0x40
# check if a function is a stub function to another function/symbol. The criteria is:
# 1. The function must only have one basic block
# 2. The function must only make one call/jump to another address
# If the function being checked is a stub function, returns the target address. Otherwise, return None.
def is_stub_function(bv: BinaryView, addr: int) -> Optional[int]:
funcs = bv.get_functions_at(addr)
for func in funcs:
if len(func.basic_blocks) != 1:
continue
call_count = 0
call_target = None
for il in func.llil.instructions:
if il.operation in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_JUMP,
LowLevelILOperation.LLIL_TAILCALL,
]:
call_count += 1
if il.dest.value.type in [
RegisterValueType.ImportedAddressValue,
RegisterValueType.ConstantValue,
RegisterValueType.ConstantPointerValue,
]:
call_target = il.dest.value.value
if call_count == 1 and call_target is not None:
return call_target
return None
def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
"""
parse instruction API features
example:
call dword [0x00473038]
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
bv: BinaryView = func.view
for llil in func.get_llils_at(ih.address):
if llil.operation in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_JUMP,
LowLevelILOperation.LLIL_TAILCALL,
]:
if llil.dest.value.type not in [
RegisterValueType.ImportedAddressValue,
RegisterValueType.ConstantValue,
RegisterValueType.ConstantPointerValue,
]:
continue
address = llil.dest.value.value
candidate_addrs = [address]
stub_addr = is_stub_function(bv, address)
if stub_addr is not None:
candidate_addrs.append(stub_addr)
for address in candidate_addrs:
sym = func.view.get_symbol_at(address)
if sym is None or sym.type not in [SymbolType.ImportAddressSymbol, SymbolType.ImportedFunctionSymbol]:
continue
sym_name = sym.short_name
lib_name = ""
import_lib = bv.lookup_imported_object_library(sym.address)
if import_lib is not None:
lib_name = import_lib[0].name
if lib_name.endswith(".dll"):
lib_name = lib_name[:-4]
elif lib_name.endswith(".so"):
lib_name = lib_name[:-3]
for name in capa.features.extractors.helpers.generate_symbols(lib_name, sym_name):
yield API(name), ih.address
if sym_name.startswith("_"):
for name in capa.features.extractors.helpers.generate_symbols(lib_name, sym_name[1:]):
yield API(name), ih.address
def extract_insn_number_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""
parse instruction number features
example:
push 3136B0h ; dwControlCode
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
bv: BinaryView = func.view
results: List[Tuple[Any[Number, OperandNumber], Address]] = []
address_size = func.view.arch.address_size * 8
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
if il.operation == LowLevelILOperation.LLIL_LOAD:
return False
if il.operation not in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]:
return True
for op in parent.operands:
if isinstance(op, ILRegister) and op.name in ["esp", "ebp", "rsp", "rbp", "sp"]:
return False
elif isinstance(op, LowLevelILInstruction) and op.operation == LowLevelILOperation.LLIL_REG:
if op.src.name in ["esp", "ebp", "rsp", "rbp", "sp"]:
return False
raw_value = il.value.value
if parent.operation == LowLevelILOperation.LLIL_SUB:
raw_value = -raw_value
results.append((Number(raw_value), ih.address))
results.append((OperandNumber(index, raw_value), ih.address))
return False
for llil in func.get_llils_at(ih.address):
visit_llil_exprs(llil, llil_checker)
for result in results:
yield result
def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
"""
parse referenced byte sequences
example:
push offset iid_004118d4_IShellLinkA ; riid
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
bv: BinaryView = func.view
candidate_addrs = set()
llil = func.get_llil_at(ih.address)
if llil is None or llil.operation in [LowLevelILOperation.LLIL_CALL, LowLevelILOperation.LLIL_CALL_STACK_ADJUST]:
return
for ref in bv.get_code_refs_from(ih.address):
if ref == ih.address:
continue
if len(bv.get_functions_containing(ref)) > 0:
continue
candidate_addrs.add(ref)
# collect candidate address by enumerating all integers, https://github.com/Vector35/binaryninja-api/issues/3966
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
if il.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]:
value = il.value.value
if value > 0:
candidate_addrs.add(value)
return False
return True
for llil in func.get_llils_at(ih.address):
visit_llil_exprs(llil, llil_checker)
for addr in candidate_addrs:
extracted_bytes = bv.read(addr, MAX_BYTES_FEATURE_SIZE)
if extracted_bytes and not capa.features.extractors.helpers.all_zeros(extracted_bytes):
if bv.get_string_at(addr) is None:
# don't extract byte features for obvious strings
yield Bytes(extracted_bytes), ih.address
def extract_insn_string_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""
parse instruction string features
example:
push offset aAcr ; "ACR > "
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
bv: BinaryView = func.view
candidate_addrs = set()
# collect candidate address from code refs directly
for ref in bv.get_code_refs_from(ih.address):
if ref == ih.address:
continue
if len(bv.get_functions_containing(ref)) > 0:
continue
candidate_addrs.add(ref)
# collect candidate address by enumerating all integers, https://github.com/Vector35/binaryninja-api/issues/3966
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
if il.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]:
value = il.value.value
if value > 0:
candidate_addrs.add(value)
return False
return True
for llil in func.get_llils_at(ih.address):
visit_llil_exprs(llil, llil_checker)
# Now we have all the candidate address, check them for string or pointer to string
br = BinaryReader(bv)
for addr in candidate_addrs:
found = bv.get_string_at(addr)
if found:
yield String(found.value), ih.address
br.seek(addr)
pointer = None
if bv.arch.address_size == 4:
pointer = br.read32()
elif bv.arch.address_size == 8:
pointer = br.read64()
if pointer is not None:
found = bv.get_string_at(pointer)
if found:
yield String(found.value), ih.address
def extract_insn_offset_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""
parse instruction structure offset features
example:
.text:0040112F cmp [esi+4], ebx
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
results: List[Tuple[Any[Offset, OperandOffset], Address]] = []
address_size = func.view.arch.address_size * 8
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
# The most common case, read/write dereference to something like `dword [eax+0x28]`
if il.operation in [LowLevelILOperation.LLIL_ADD, LowLevelILOperation.LLIL_SUB]:
left = il.left
right = il.right
# Exclude offsets based on stack/franme pointers
if left.operation == LowLevelILOperation.LLIL_REG and left.src.name in ["esp", "ebp", "rsp", "rbp", "sp"]:
return True
if right.operation != LowLevelILOperation.LLIL_CONST:
return True
raw_value = right.value.value
# If this is not a dereference, then this must be an add and the offset must be in the range \
# [0, MAX_STRUCTURE_SIZE]. For example,
# add eax, 0x10,
# lea ebx, [eax + 1]
if parent.operation not in [LowLevelILOperation.LLIL_LOAD, LowLevelILOperation.LLIL_STORE]:
if il.operation != LowLevelILOperation.LLIL_ADD or (not 0 < raw_value < MAX_STRUCTURE_SIZE):
return False
if address_size > 0:
# BN also encodes the constant value as two's complement, we need to restore its original value
value = capa.features.extractors.helpers.twos_complement(raw_value, address_size)
else:
value = raw_value
results.append((Offset(value), ih.address))
results.append((OperandOffset(index, value), ih.address))
return False
# An edge case: for code like `push dword [esi]`, we need to generate a feature for offset 0x0
elif il.operation in [LowLevelILOperation.LLIL_LOAD, LowLevelILOperation.LLIL_STORE]:
if il.operands[0].operation == LowLevelILOperation.LLIL_REG:
results.append((Offset(0), ih.address))
results.append((OperandOffset(index, 0), ih.address))
return False
return True
for llil in func.get_llils_at(ih.address):
visit_llil_exprs(llil, llil_checker)
for result in results:
yield result
def is_nzxor_stack_cookie(f: Function, bb: BinjaBasicBlock, llil: LowLevelILInstruction) -> bool:
"""check if nzxor exists within stack cookie delta"""
# TODO: we can do a much accurate analysi using LLIL SSA
reg_names = []
if llil.left.operation == LowLevelILOperation.LLIL_REG:
reg_names.append(llil.left.src.name)
if llil.right.operation == LowLevelILOperation.LLIL_REG:
reg_names.append(llil.right.src.name)
# stack cookie reg should be stack/frame pointer
if not any(reg in ["ebp", "esp", "rbp", "rsp", "sp"] for reg in reg_names):
return False
# expect security cookie init in first basic block within first bytes (instructions)
if len(bb.incoming_edges) == 0 and llil.address < (bb.start + SECURITY_COOKIE_BYTES_DELTA):
return True
# ... or within last bytes (instructions) before a return
if len(bb.outgoing_edges) == 0 and llil.address > (bb.end - SECURITY_COOKIE_BYTES_DELTA):
return True
return False
def extract_insn_nzxor_characteristic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""
parse instruction non-zeroing XOR instruction
ignore expected non-zeroing XORs, e.g. security cookies
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
results = []
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
# If the two operands of the xor instruction are the same, the LLIL will be translated to other instructions,
# e.g., <llil: eax = 0>, (LLIL_SET_REG). So we do not need to check whether the two operands are the same.
if il.operation == LowLevelILOperation.LLIL_XOR:
# Exclude cases related to the stack cookie
if is_nzxor_stack_cookie(fh.inner, bbh.inner[0], il):
return False
results.append((Characteristic("nzxor"), ih.address))
return False
else:
return True
for llil in func.get_llils_at(ih.address):
visit_llil_exprs(llil, llil_checker)
for result in results:
yield result
def extract_insn_mnemonic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""parse instruction mnemonic features"""
insn: DisassemblyInstruction = ih.inner
yield Mnemonic(insn.text[0].text), ih.address
def extract_insn_obfs_call_plus_5_characteristic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""
parse call $+5 instruction from the given instruction.
"""
insn: DisassemblyInstruction = ih.inner
if insn.text[0].text == "call" and insn.text[2].text == "$+5" and insn.length == 5:
yield Characteristic("call $+5"), ih.address
def extract_insn_peb_access_characteristic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""parse instruction peb access
fs:[0x30] on x86, gs:[0x60] on x64
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
results = []
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILOperation, index: int) -> bool:
if il.operation != LowLevelILOperation.LLIL_LOAD:
return True
src = il.src
if src.operation != LowLevelILOperation.LLIL_ADD:
return True
left = src.left
right = src.right
if left.operation != LowLevelILOperation.LLIL_REG:
return True
reg = left.src.name
if right.operation != LowLevelILOperation.LLIL_CONST:
return True
value = right.value.value
if not (reg, value) in (("fsbase", 0x30), ("gsbase", 0x60)):
return True
results.append((Characteristic("peb access"), ih.address))
return False
for llil in func.get_llils_at(ih.address):
visit_llil_exprs(llil, llil_checker)
for result in results:
yield result
def extract_insn_segment_access_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""parse instruction fs or gs access"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
results = []
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
if il.operation == LowLevelILOperation.LLIL_REG:
reg = il.src.name
if reg == "fsbase":
results.append((Characteristic("fs access"), ih.address))
return False
elif reg == "gsbase":
results.append((Characteristic("gs access"), ih.address))
return False
return False
return True
for llil in func.get_llils_at(ih.address):
visit_llil_exprs(llil, llil_checker)
for result in results:
yield result
def extract_insn_cross_section_cflow(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""inspect the instruction for a CALL or JMP that crosses section boundaries"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
bv: BinaryView = func.view
if bv is None:
return
seg1 = bv.get_segment_at(ih.address)
sections1 = bv.get_sections_at(ih.address)
for ref in bv.get_code_refs_from(ih.address):
if len(bv.get_functions_at(ref)) == 0:
continue
seg2 = bv.get_segment_at(ref)
sections2 = bv.get_sections_at(ref)
if seg1 != seg2 or sections1 != sections2:
yield Characteristic("cross section flow"), ih.address
def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
"""extract functions calls from features
most relevant at the function scope, however, its most efficient to extract at the instruction scope
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
bv: BinaryView = func.view
if bv is None:
return
for il in func.get_llils_at(ih.address):
if il.operation not in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_TAILCALL,
]:
continue
dest = il.dest
if dest.operation == LowLevelILOperation.LLIL_CONST_PTR:
value = dest.value.value
yield Characteristic("calls from"), AbsoluteVirtualAddress(value)
elif dest.operation == LowLevelILOperation.LLIL_CONST:
yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value)
elif dest.operation == LowLevelILOperation.LLIL_LOAD:
indirect_src = dest.src
if indirect_src.operation == LowLevelILOperation.LLIL_CONST_PTR:
value = indirect_src.value.value
yield Characteristic("calls from"), AbsoluteVirtualAddress(value)
elif indirect_src.operation == LowLevelILOperation.LLIL_CONST:
yield Characteristic("calls from"), AbsoluteVirtualAddress(indirect_src.value)
elif dest.operation == LowLevelILOperation.LLIL_REG:
if dest.value.type in [
RegisterValueType.ImportedAddressValue,
RegisterValueType.ConstantValue,
RegisterValueType.ConstantPointerValue,
]:
yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value.value)
def extract_function_indirect_call_characteristic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
"""extract indirect function calls (e.g., call eax or call dword ptr [edx+4])
does not include calls like => call ds:dword_ABD4974
most relevant at the function or basic block scope;
however, its most efficient to extract at the instruction scope
"""
insn: DisassemblyInstruction = ih.inner
func: Function = fh.inner
llil = func.get_llil_at(ih.address)
if llil is None or llil.operation not in [
LowLevelILOperation.LLIL_CALL,
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
LowLevelILOperation.LLIL_TAILCALL,
]:
return
if llil.dest.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]:
return
if llil.dest.operation == LowLevelILOperation.LLIL_LOAD:
src = llil.dest.src
if src.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]:
return
yield Characteristic("indirect call"), ih.address
def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
"""extract instruction features"""
for inst_handler in INSTRUCTION_HANDLERS:
for feature, ea in inst_handler(f, bbh, insn):
yield feature, ea
INSTRUCTION_HANDLERS = (
extract_insn_api_features,
extract_insn_number_features,
extract_insn_bytes_features,
extract_insn_string_features,
extract_insn_offset_features,
extract_insn_nzxor_characteristic_features,
extract_insn_mnemonic_features,
extract_insn_obfs_call_plus_5_characteristic_features,
extract_insn_peb_access_characteristic_features,
extract_insn_cross_section_cflow,
extract_insn_segment_access_features,
extract_function_calls_from,
extract_function_indirect_call_characteristic_features,
)
def main():
""" """
if len(sys.argv) < 2:
return
from binaryninja import BinaryViewType
from capa.features.extractors.binja.extractor import BinjaFeatureExtractor
bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1])
if bv is None:
return
features = []
extractor = BinjaFeatureExtractor(bv)
for fh in extractor.get_functions():
for bbh in extractor.get_basic_blocks(fh):
for insn in extractor.get_instructions(fh, bbh):
features.extend(list(extract_features(fh, bbh, insn)))
import pprint
pprint.pprint(features)
if __name__ == "__main__":
main()

View File

@@ -268,7 +268,8 @@ def dumps(extractor: capa.features.extractors.base_extractor.FeatureExtractor) -
basic_block=bbaddr,
address=Address.from_capa(addr),
feature=feature_from_capa(feature),
)
) # type: ignore
# Mypy is unable to recognise `basic_block` as a argument due to alias
for feature, addr in extractor.extract_basic_block_features(f, bb)
]
@@ -287,38 +288,41 @@ def dumps(extractor: capa.features.extractors.base_extractor.FeatureExtractor) -
instructions.append(
InstructionFeatures(
address=iaddr,
features=ifeatures,
features=tuple(ifeatures),
)
)
basic_blocks.append(
BasicBlockFeatures(
address=bbaddr,
features=bbfeatures,
instructions=instructions,
features=tuple(bbfeatures),
instructions=tuple(instructions),
)
)
function_features.append(
FunctionFeatures(
address=faddr,
features=ffeatures,
features=tuple(ffeatures),
basic_blocks=basic_blocks,
)
) # type: ignore
# Mypy is unable to recognise `basic_blocks` as a argument due to alias
)
features = Features(
global_=global_features,
file=file_features,
functions=function_features,
)
file=tuple(file_features),
functions=tuple(function_features),
) # type: ignore
# Mypy is unable to recognise `global_` as a argument due to alias
freeze = Freeze(
version=2,
base_address=Address.from_capa(extractor.get_base_address()),
extractor=Extractor(name=extractor.__class__.__name__),
features=features,
)
) # type: ignore
# Mypy is unable to recognise `base_address` as a argument due to alias
return freeze.json()

View File

@@ -101,59 +101,79 @@ class FeatureModel(BaseModel):
def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
if isinstance(f, capa.features.common.OS):
assert isinstance(f.value, str)
return OSFeature(os=f.value, description=f.description)
elif isinstance(f, capa.features.common.Arch):
assert isinstance(f.value, str)
return ArchFeature(arch=f.value, description=f.description)
elif isinstance(f, capa.features.common.Format):
assert isinstance(f.value, str)
return FormatFeature(format=f.value, description=f.description)
elif isinstance(f, capa.features.common.MatchedRule):
assert isinstance(f.value, str)
return MatchFeature(match=f.value, description=f.description)
elif isinstance(f, capa.features.common.Characteristic):
assert isinstance(f.value, str)
return CharacteristicFeature(characteristic=f.value, description=f.description)
elif isinstance(f, capa.features.file.Export):
assert isinstance(f.value, str)
return ExportFeature(export=f.value, description=f.description)
elif isinstance(f, capa.features.file.Import):
return ImportFeature(import_=f.value, description=f.description)
assert isinstance(f.value, str)
return ImportFeature(import_=f.value, description=f.description) # type: ignore
# Mypy is unable to recognise `import_` as a argument due to alias
elif isinstance(f, capa.features.file.Section):
assert isinstance(f.value, str)
return SectionFeature(section=f.value, description=f.description)
elif isinstance(f, capa.features.file.FunctionName):
return FunctionNameFeature(function_name=f.value, description=f.description)
assert isinstance(f.value, str)
return FunctionNameFeature(function_name=f.value, description=f.description) # type: ignore
# Mypy is unable to recognise `function_name` as a argument due to alias
# must come before check for String due to inheritance
elif isinstance(f, capa.features.common.Substring):
assert isinstance(f.value, str)
return SubstringFeature(substring=f.value, description=f.description)
# must come before check for String due to inheritance
elif isinstance(f, capa.features.common.Regex):
assert isinstance(f.value, str)
return RegexFeature(regex=f.value, description=f.description)
elif isinstance(f, capa.features.common.String):
assert isinstance(f.value, str)
return StringFeature(string=f.value, description=f.description)
elif isinstance(f, capa.features.common.Class):
return ClassFeature(class_=f.value, description=f.description)
assert isinstance(f.value, str)
return ClassFeature(class_=f.value, description=f.description) # type: ignore
# Mypy is unable to recognise `class_` as a argument due to alias
elif isinstance(f, capa.features.common.Namespace):
assert isinstance(f.value, str)
return NamespaceFeature(namespace=f.value, description=f.description)
elif isinstance(f, capa.features.basicblock.BasicBlock):
return BasicBlockFeature(description=f.description)
elif isinstance(f, capa.features.insn.API):
assert isinstance(f.value, str)
return APIFeature(api=f.value, description=f.description)
elif isinstance(f, capa.features.insn.Property):
assert isinstance(f.value, str)
return PropertyFeature(property=f.value, access=f.access, description=f.description)
elif isinstance(f, capa.features.insn.Number):
assert isinstance(f.value, (int, float))
return NumberFeature(number=f.value, description=f.description)
elif isinstance(f, capa.features.common.Bytes):
@@ -162,16 +182,22 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
return BytesFeature(bytes=binascii.hexlify(buf).decode("ascii"), description=f.description)
elif isinstance(f, capa.features.insn.Offset):
assert isinstance(f.value, int)
return OffsetFeature(offset=f.value, description=f.description)
elif isinstance(f, capa.features.insn.Mnemonic):
assert isinstance(f.value, str)
return MnemonicFeature(mnemonic=f.value, description=f.description)
elif isinstance(f, capa.features.insn.OperandNumber):
return OperandNumberFeature(index=f.index, operand_number=f.value, description=f.description)
assert isinstance(f.value, int)
return OperandNumberFeature(index=f.index, operand_number=f.value, description=f.description) # type: ignore
# Mypy is unable to recognise `operand_number` as a argument due to alias
elif isinstance(f, capa.features.insn.OperandOffset):
return OperandOffsetFeature(index=f.index, operand_offset=f.value, description=f.description)
assert isinstance(f.value, int)
return OperandOffsetFeature(index=f.index, operand_offset=f.value, description=f.description) # type: ignore
# Mypy is unable to recognise `operand_offset` as a argument due to alias
else:
raise NotImplementedError(f"feature_from_capa({type(f)}) not implemented")

View File

@@ -70,7 +70,7 @@ class Number(Feature):
elif isinstance(self.value, float):
return str(self.value)
else:
raise ValueError("invalid value type")
raise ValueError("invalid value type %s" % (type(self.value)))
# max recognized structure size (and therefore, offset size)

View File

@@ -77,6 +77,7 @@ RULES_PATH_DEFAULT_STRING = "(embedded rules)"
SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)"
BACKEND_VIV = "vivisect"
BACKEND_DOTNET = "dotnet"
BACKEND_BINJA = "binja"
E_MISSING_RULES = 10
E_MISSING_FILE = 11
@@ -523,6 +524,33 @@ def get_extractor(
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
elif backend == BACKEND_BINJA:
from capa.features.extractors.binja.find_binja_api import find_binja_path
# When we are running as a standalone executable, we cannot directly import binaryninja
# We need to fist find the binja API installation path and add it into sys.path
if is_running_standalone():
bn_api = find_binja_path()
if os.path.exists(bn_api):
sys.path.append(bn_api)
try:
from binaryninja import BinaryView, BinaryViewType
except ImportError:
raise RuntimeError(
"Cannot import binaryninja module. Please install the Binary Ninja Python API first: "
"https://docs.binary.ninja/dev/batch.html#install-the-api)."
)
import capa.features.extractors.binja.extractor
with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
bv: BinaryView = BinaryViewType.get_view_of_file(path)
if bv is None:
raise RuntimeError("Binary Ninja cannot open file %s" % (path))
return capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv)
# default to use vivisect backend
else:
import capa.features.extractors.viv.extractor
@@ -873,7 +901,7 @@ def install_common_args(parser, wanted=None):
"--backend",
type=str,
help="select the backend to use",
choices=(BACKEND_VIV,),
choices=(BACKEND_VIV, BACKEND_BINJA),
default=BACKEND_VIV,
)

View File

@@ -354,9 +354,9 @@ class Match(FrozenModel):
return cls(
success=success,
node=node,
children=children,
locations=locations,
captures=captures,
children=tuple(children),
locations=tuple(locations),
captures={capture: tuple(captures[capture]) for capture in captures},
)
@@ -485,8 +485,8 @@ class RuleMetadata(FrozenModel):
namespace=rule.meta.get("namespace"),
authors=rule.meta.get("authors"),
scope=capa.rules.Scope(rule.meta.get("scope")),
attack=list(map(AttackSpec.from_str, rule.meta.get("att&ck", []))),
mbc=list(map(MBCSpec.from_str, rule.meta.get("mbc", []))),
attack=tuple(map(AttackSpec.from_str, rule.meta.get("att&ck", []))),
mbc=tuple(map(MBCSpec.from_str, rule.meta.get("mbc", []))),
references=rule.meta.get("references", []),
examples=rule.meta.get("examples", []),
description=rule.meta.get("description", ""),
@@ -498,8 +498,10 @@ class RuleMetadata(FrozenModel):
malware_family=rule.meta.get("maec/malware-family"),
malware_category=rule.meta.get("maec/malware-category"),
malware_category_ov=rule.meta.get("maec/malware-category-ov"),
),
)
), # type: ignore
# Mypy is unable to recognise arguments due to alias
) # type: ignore
# Mypy is unable to recognise arguments due to alias
class Config:
frozen = True

View File

@@ -126,6 +126,12 @@ Or install capa with build dependencies:
`$ pip install -e /local/path/to/src[build]`
#### Generate rule cache
Generate cache for all rules in the `rules` folder and save the output in the `cache` folder.
`$ python scripts/cache-ruleset.py rules/ cache/`
#### Run Pyinstaller
`$ pyinstaller .github/pyinstaller/pyinstaller.spec`

2
rules

Submodule rules updated: 8a2e23baa0...232af1ca4c

View File

@@ -76,7 +76,7 @@ setuptools.setup(
"pycodestyle==2.10.0",
"black==23.1.0",
"isort==5.11.4",
"mypy==1.0.1",
"mypy==1.1.1",
"psutil==5.9.2",
"stix2==3.0.1",
"requests==2.28.0",

View File

@@ -160,6 +160,29 @@ def get_dnfile_extractor(path):
return extractor
@lru_cache(maxsize=1)
def get_binja_extractor(path):
from binaryninja import Settings, BinaryViewType
import capa.features.extractors.binja.extractor
# Workaround for a BN bug: https://github.com/Vector35/binaryninja-api/issues/4051
settings = Settings()
if path.endswith("kernel32-64.dll_"):
old_pdb = settings.get_bool("pdb.loadGlobalSymbols")
settings.set_bool("pdb.loadGlobalSymbols", False)
bv = BinaryViewType.get_view_of_file(path)
if path.endswith("kernel32-64.dll_"):
settings.set_bool("pdb.loadGlobalSymbols", old_pdb)
extractor = capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv)
# overload the extractor so that the fixture exposes `extractor.path`
setattr(extractor, "path", path)
return extractor
def extract_global_features(extractor):
features = collections.defaultdict(set)
for feature, va in extractor.extract_global_features():
@@ -668,7 +691,7 @@ FEATURE_PRESENCE_TESTS = sorted(
("mimikatz", "function=0x46D534", capa.features.common.Characteristic("nzxor"), False),
# insn/characteristic(nzxor): xorps
# viv needs fixup to recognize function, see above
("3b13b...", "function=0x10006860", capa.features.common.Characteristic("nzxor"), True),
("mimikatz", "function=0x410dfc", capa.features.common.Characteristic("nzxor"), True),
# insn/characteristic(peb access)
("kernel32-64", "function=0x1800017D0", capa.features.common.Characteristic("peb access"), True),
("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("peb access"), False),

View File

@@ -0,0 +1,47 @@
# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
import fixtures
from fixtures import *
logger = logging.getLogger(__file__)
# We need to skip the binja test if we cannot import binaryninja, e.g., in GitHub CI.
binja_present: bool = False
try:
import binaryninja
try:
binaryninja.load(source=b"\x90")
except RuntimeError as e:
logger.warning("Binary Ninja license is not valid, provide via $BN_LICENSE or license.dat")
else:
binja_present = True
except ImportError:
pass
@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed")
@fixtures.parametrize(
"sample,scope,feature,expected",
fixtures.FEATURE_PRESENCE_TESTS,
indirect=["sample", "scope"],
)
def test_binja_features(sample, scope, feature, expected):
fixtures.do_test_feature_presence(fixtures.get_binja_extractor, sample, scope, feature, expected)
@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed")
@fixtures.parametrize(
"sample,scope,feature,expected",
fixtures.FEATURE_COUNT_TESTS,
indirect=["sample", "scope"],
)
def test_binja_feature_counts(sample, scope, feature, expected):
fixtures.do_test_feature_count(fixtures.get_binja_extractor, sample, scope, feature, expected)