diff --git a/CHANGELOG.md b/CHANGELOG.md index ce457bf3..5c82e250 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ - binja: major performance improvement on the binja extractor. #1414 @xusheng6 - cape: make Process model flexible and procmemory optional to load newest reports #2466 @mr-tz - binja: fix unit test failure by fixing up the analysis for file al-khaser_x64.exe_ #2507 @xusheng6 +- binja: move the stack string detection to function level #2516 @xusheng6 ### capa Explorer Web diff --git a/capa/features/extractors/binja/basicblock.py b/capa/features/extractors/binja/basicblock.py index 5cb8ca13..2e47770b 100644 --- a/capa/features/extractors/binja/basicblock.py +++ b/capa/features/extractors/binja/basicblock.py @@ -5,111 +5,21 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
- -import string from typing import Iterator -from binaryninja import Function from binaryninja import BasicBlock as BinjaBasicBlock -from binaryninja import ( - BinaryView, - SymbolType, - RegisterValueType, - VariableSourceType, - MediumLevelILOperation, - MediumLevelILBasicBlock, - MediumLevelILInstruction, -) from capa.features.common import Feature, Characteristic from capa.features.address import Address from capa.features.basicblock import BasicBlock -from capa.features.extractors.helpers import MIN_STACKSTRING_LEN from capa.features.extractors.base_extractor import BBHandle, FunctionHandle -def get_printable_len_ascii(s: bytes) -> int: - """Return string length if all operand bytes are ascii or utf16-le printable""" - count = 0 - for c in s: - if c == 0: - return count - if c < 127 and chr(c) in string.printable: - count += 1 - return count - - -def get_printable_len_wide(s: bytes) -> int: - """Return string length if all operand bytes are ascii or utf16-le printable""" - if all(c == 0x00 for c in s[1::2]): - return get_printable_len_ascii(s[::2]) - return 0 - - -def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int: - bv: BinaryView = f.view - - if il.operation != MediumLevelILOperation.MLIL_CALL: - return 0 - - target = il.dest - if target.operation not in [MediumLevelILOperation.MLIL_CONST, MediumLevelILOperation.MLIL_CONST_PTR]: - return 0 - - addr = target.value.value - sym = bv.get_symbol_at(addr) - if not sym or sym.type not in [SymbolType.LibraryFunctionSymbol, SymbolType.SymbolicFunctionSymbol]: - return 0 - - if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]: - return 0 - - if len(il.params) < 2: - return 0 - - dest = il.params[0] - if dest.operation in [MediumLevelILOperation.MLIL_ADDRESS_OF, MediumLevelILOperation.MLIL_VAR]: - var = dest.src - else: - return 0 - - if var.source_type != VariableSourceType.StackVariableSourceType: - return 0 - - src = il.params[1] - if src.value.type != 
RegisterValueType.ConstantDataAggregateValue: - return 0 - - s = f.get_constant_data(RegisterValueType.ConstantDataAggregateValue, src.value.value) - return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s))) - - -def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool: - """check basic block for stackstring indicators - - true if basic block contains enough moves of constant bytes to the stack - """ - count = 0 - for il in bb: - count += get_stack_string_len(f, il) - if count > MIN_STACKSTRING_LEN: - return True - - return False - - -def extract_bb_stackstring(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]: - """extract stackstring indicators from basic block""" - bb: tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner - if bb[1] is not None and bb_contains_stackstring(fh.inner, bb[1]): - yield Characteristic("stack string"), bbh.address - - def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]: """extract tight loop indicators from a basic block""" - bb: tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner - for edge in bb[0].outgoing_edges: - if edge.target.start == bb[0].start: + bb: BinjaBasicBlock = bbh.inner + for edge in bb.outgoing_edges: + if edge.target.start == bb.start: yield Characteristic("tight loop"), bbh.address @@ -121,7 +31,4 @@ def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Featur yield BasicBlock(), bbh.address -BASIC_BLOCK_HANDLERS = ( - extract_bb_tight_loop, - extract_bb_stackstring, -) +BASIC_BLOCK_HANDLERS = (extract_bb_tight_loop,) diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 1d4dd6bd..953cde76 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -8,7 +8,6 @@ from typing import Iterator import binaryninja as binja -from binaryninja import ILException import 
capa.features.extractors.elf import capa.features.extractors.binja.file @@ -54,23 +53,8 @@ class BinjaFeatureExtractor(StaticFeatureExtractor): def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: f: binja.Function = fh.inner - # Set up a MLIL basic block dict look up to associate the disassembly basic block with its MLIL basic block - mlil_lookup = {} - try: - mlil = f.mlil - except ILException: - return - - if mlil is None: - return - - for mlil_bb in mlil.basic_blocks: - mlil_lookup[mlil_bb.source_block.start] = mlil_bb - for bb in f.basic_blocks: - mlil_bb = mlil_lookup.get(bb.start) - - yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=(bb, mlil_bb)) + yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=bb) def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]: yield from capa.features.extractors.binja.basicblock.extract_features(fh, bbh) @@ -78,10 +62,10 @@ class BinjaFeatureExtractor(StaticFeatureExtractor): def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: import capa.features.extractors.binja.helpers as binja_helpers - bb: tuple[binja.BasicBlock, binja.MediumLevelILBasicBlock] = bbh.inner - addr = bb[0].start + bb: binja.BasicBlock = bbh.inner + addr = bb.start - for text, length in bb[0]: + for text, length in bb: insn = binja_helpers.DisassemblyInstruction(addr, length, text) yield InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn) addr += length diff --git a/capa/features/extractors/binja/function.py b/capa/features/extractors/binja/function.py index 18973539..c7c017d1 100644 --- a/capa/features/extractors/binja/function.py +++ b/capa/features/extractors/binja/function.py @@ -5,14 +5,27 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and limitations under the License.
+import string
 from typing import Iterator
 
-from binaryninja import Function, BinaryView, SymbolType, LowLevelILOperation
+from binaryninja import (
+    Function,
+    BinaryView,
+    SymbolType,
+    ILException,
+    RegisterValueType,
+    VariableSourceType,
+    LowLevelILOperation,
+    MediumLevelILOperation,
+    MediumLevelILBasicBlock,
+    MediumLevelILInstruction,
+)
 
 from capa.features.file import FunctionName
 from capa.features.common import Feature, Characteristic
 from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors import loops
+from capa.features.extractors.helpers import MIN_STACKSTRING_LEN
 from capa.features.extractors.binja.helpers import get_llil_instr_at_addr
 from capa.features.extractors.base_extractor import FunctionHandle
 
@@ -95,10 +108,117 @@ def extract_function_name(fh: FunctionHandle):
             yield FunctionName(name[1:]), sym.address
 
 
+def get_printable_len_ascii(s: bytes) -> int:
+    """Count the printable ASCII bytes in s up to the first NUL byte; non-printable bytes are skipped"""
+    count = 0
+    for c in s:
+        if c == 0:
+            return count
+        if c < 127 and chr(c) in string.printable:
+            count += 1
+    return count
+
+
+def get_printable_len_wide(s: bytes) -> int:
+    """Return get_printable_len_ascii(s[::2]) if every odd-indexed byte of s is 0x00 (UTF-16LE style), else 0"""
+    if all(c == 0x00 for c in s[1::2]):
+        return get_printable_len_ascii(s[::2])
+    return 0
+
+
+def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int:
+    """return the printable length of the constant data that `il` copies into a stack variable
+
+    only MLIL calls to __builtin_strncpy/__builtin_strcpy/__builtin_wcscpy whose destination is a
+    stack variable and whose source is a constant data aggregate are considered; anything else yields 0.
+    """
+    bv: BinaryView = f.view
+
+    if il.operation != MediumLevelILOperation.MLIL_CALL:
+        return 0
+
+    target = il.dest
+    if target.operation not in [MediumLevelILOperation.MLIL_CONST, MediumLevelILOperation.MLIL_CONST_PTR]:
+        return 0
+
+    addr = target.value.value
+    sym = bv.get_symbol_at(addr)
+    if not sym or sym.type not in [SymbolType.LibraryFunctionSymbol, SymbolType.SymbolicFunctionSymbol]:
+        return 0
+
+    if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]:
+        return 0
+
+    if len(il.params) < 2:
+        return 0
+
+    dest = il.params[0]
+    if dest.operation in [MediumLevelILOperation.MLIL_ADDRESS_OF, MediumLevelILOperation.MLIL_VAR]:
+        var = dest.src
+    else:
+        return 0
+
+    if var.source_type != VariableSourceType.StackVariableSourceType:
+        return 0
+
+    src = il.params[1]
+    if src.value.type != RegisterValueType.ConstantDataAggregateValue:
+        return 0
+
+    s = f.get_constant_data(RegisterValueType.ConstantDataAggregateValue, src.value.value)
+    return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s)))
+
+
+def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool:
+    """check basic block for stackstring indicators
+
+    true if basic block contains enough moves of constant bytes to the stack
+    """
+    count = 0
+    for il in bb:
+        count += get_stack_string_len(f, il)
+        if count > MIN_STACKSTRING_LEN:
+            return True
+
+    return False
+
+
+def extract_stackstring(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
+    """extract stackstring indicators
+
+    yields one "stack string" characteristic for each MLIL basic block that qualifies,
+    located at the start address of that block's source (disassembly) basic block.
+    """
+    func: Function = fh.inner
+    bv: BinaryView = func.view
+    if bv is None:
+        return
+
+    try:
+        mlil = func.mlil
+    except ILException:
+        return
+
+    # guard against a missing MLIL function object instead of failing on `.basic_blocks`
+    if mlil is None:
+        return
+
+    for block in mlil.basic_blocks:
+        if bb_contains_stackstring(func, block):
+            # wrap the raw integer so the yielded location is an Address, as the return type states
+            yield Characteristic("stack string"), AbsoluteVirtualAddress(block.source_block.start)
+
+
 def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
     for func_handler in FUNCTION_HANDLERS:
         for feature, addr in func_handler(fh):
             yield feature, addr
 
 
-FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call, extract_function_name)
+FUNCTION_HANDLERS = (
+    extract_function_calls_to,
+    extract_function_loop,
+    extract_recursive_call,
+    extract_function_name,
+    extract_stackstring,
+)
diff --git a/capa/features/extractors/binja/insn.py b/capa/features/extractors/binja/insn.py
index 7ebbb6d7..90be9a55 100644
--- a/capa/features/extractors/binja/insn.py
+++ b/capa/features/extractors/binja/insn.py
@@ -359,7 +359,7 @@ def extract_insn_nzxor_characteristic_features( # e.g., , (LLIL_SET_REG). So we do not need to check whether the two operands are the same. if il.operation == LowLevelILOperation.LLIL_XOR: # Exclude cases related to the stack cookie - if is_nzxor_stack_cookie(fh.inner, bbh.inner[0], il): + if is_nzxor_stack_cookie(fh.inner, bbh.inner, il): return False results.append((Characteristic("nzxor"), ih.address)) return False