Mirror of https://github.com/mandiant/capa.git (synced 2025-12-05 20:40:05 -08:00)
Replace the header in source code files using the following script:
```Python
import os
import re

# OLD_HEADER (a bytes regex with a named `year` group) and NEW_HEADER
# (a bytes %-template) are defined elsewhere.
for dir_path, dir_names, file_names in os.walk("capa"):
    for file_name in file_names:
        # headers only appear in `.py` and `.toml` files
        if not file_name.endswith((".py", ".toml")):
            continue
        file_path = f"{dir_path}/{file_name}"
        with open(file_path, "rb+") as f:
            content = f.read()
            m = re.search(OLD_HEADER, content)
            if not m:
                continue
            print(f"{file_path}: {m.group('year')}")
            content = content.replace(m.group(0), NEW_HEADER % m.group("year"))
            f.seek(0)
            f.write(content)
            # truncate in case the new header is shorter than the old one
            f.truncate()
```
Some files had the copyright header inside a `"""` string and needed manual
changes before applying the script. `hook-vivisect.py` and `pyinstaller.spec`
didn't include the license in the header and also needed manual changes.

The old header contained the confusing phrase `All rights reserved`, which
does not make sense for an open source license. Replace it with the default
Google header, which corrects this issue and keeps capa consistent with other
Google projects.

Adapt the linter to work with the new header (a rough sketch of such a check
is shown below). Also replace the copyright text in `web/public/index.html`
for consistency.
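The linter change itself is not shown in this commit message. Purely as an illustration, a header check could look roughly like the following minimal sketch. This is a hypothetical example, not capa's actual linter code: `EXPECTED_FIRST_LINE`, the `capa` root directory default, and the `.py`-only walk are all assumptions.

```Python
# Hypothetical sketch: fail if a Python source file does not start with the new header.
# This is NOT capa's actual linter; names and behavior here are assumptions.
import sys
from pathlib import Path

EXPECTED_FIRST_LINE = "# Copyright"  # assumption: key off the header's first line


def check_headers(root: str = "capa") -> int:
    missing: list[Path] = []
    for path in sorted(Path(root).rglob("*.py")):
        lines = path.read_text(encoding="utf-8").splitlines()
        if not lines or not lines[0].startswith(EXPECTED_FIRST_LINE):
            missing.append(path)
    for path in missing:
        print(f"missing or outdated header: {path}")
    return 1 if missing else 0


if __name__ == "__main__":
    sys.exit(check_headers())
```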
One of the repository's source files, shown below with the new header applied (610 lines, 25 KiB, Python):
```Python
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import logging
from typing import Any
from pathlib import Path

import pytest
import fixtures
from google.protobuf.json_format import ParseDict

import capa.features.extractors.binexport2.helpers
from capa.features.extractors.binexport2.helpers import (
    BinExport2InstructionPattern,
    BinExport2InstructionPatternMatcher,
    split_with_delimiters,
    get_operand_expressions,
    get_instruction_mnemonic,
    get_instruction_operands,
    get_operand_register_expression,
    get_operand_immediate_expression,
)
from capa.features.extractors.binexport2.extractor import BinExport2FeatureExtractor
from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression

logger = logging.getLogger(__name__)

CD = Path(__file__).resolve().parent


# found via https://www.virustotal.com/gui/search/type%253Aelf%2520and%2520size%253A1.2kb%252B%2520and%2520size%253A1.4kb-%2520and%2520tag%253Aarm%2520and%2520not%2520tag%253Arelocatable%2520and%2520tag%253A64bits/files
# Ghidra disassembly of c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486
GHIDRA_DISASSEMBLY = """
                             //
                             // segment_1
                             // Loadable segment [0x200000 - 0x200157]
                             // ram:00200000-ram:00200157
                             //
        00200000 7f 45 4c        Elf64_Ehdr
                 ...
                             //
                             // .text
                             // SHT_PROGBITS [0x210158 - 0x2101c7]
                             // ram:00210158-ram:002101c7
                             //
                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             undefined entry()
                 undefined         w0:1           <RETURN>
                             _start                              XREF[4]:     Entry Point(*), 00200018(*),
                             entry                                            002000c0(*),
                                                                              _elfSectionHeaders::00000050(*)
        00210158 20 00 80 d2     mov        x0,#0x1
        0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0                   = "Hello World!\n"
                                                                                             = 00000000002201C8h
        00210160 c2 02 00 58     ldr        x2,DAT_002101b8                                  = 000000000000000Eh
        00210164 08 08 80 d2     mov        x8,#0x40
        00210168 01 00 00 d4     svc        0x0
        0021016c a0 02 00 58     ldr        x0=>$stringWith_Weird_Name,DAT_002101c0          = "This string has a very strang
                                                                                             = 00000000002201D6h
        00210170 04 00 00 94     bl         printString                                      undefined printString()
        00210174 60 0f 80 d2     mov        x0,#0x7b
        00210178 a8 0b 80 d2     mov        x8,#0x5d
        0021017c 01 00 00 d4     svc        0x0
                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             undefined printString()
                 undefined         w0:1           <RETURN>
                             printString                         XREF[1]:     entry:00210170(c)
        00210180 01 00 80 d2     mov        x1,#0x0
                             strlenLoop                          XREF[1]:     00210194(j)
        00210184 02 68 61 38     ldrb       w2,[x0, x1, LSL ]
        00210188 5f 00 00 71     cmp        w2,#0x0
        0021018c 60 00 00 54     b.eq       strlenDone
        00210190 21 04 00 91     add        x1,x1,#0x1
        00210194 fc ff ff 17     b          strlenLoop
                             strlenDone                          XREF[1]:     0021018c(j)
        00210198 e2 03 01 aa     mov        x2,x1
        0021019c e1 03 00 aa     mov        x1,x0
        002101a0 20 00 80 d2     mov        x0,#0x1
        002101a4 08 08 80 d2     mov        x8,#0x40
        002101a8 01 00 00 d4     svc        0x0
        002101ac c0 03 5f d6     ret
                             DAT_002101b0                        XREF[1]:     entry:0021015c(R)
        002101b0 c8 01 22        undefined8 00000000002201C8h                 ?  ->  002201c8
                 00 00 00
                 00 00
                             DAT_002101b8                        XREF[1]:     entry:00210160(R)
        002101b8 0e 00 00        undefined8 000000000000000Eh
                 00 00 00
                 00 00
                             DAT_002101c0                        XREF[1]:     entry:0021016c(R)
        002101c0 d6 01 22        undefined8 00000000002201D6h                 ?  ->  002201d6
                 00 00 00
                 00 00
                             //
                             // .data
                             // SHT_PROGBITS [0x2201c8 - 0x2201fb]
                             // ram:002201c8-ram:002201fb
                             //
                             helloWorldStr                       XREF[3]:     002000f8(*), entry:0021015c(*),
                                                                              _elfSectionHeaders::00000090(*)
        002201c8 48 65 6c        ds         "Hello World!\n"
                 6c 6f 20
                 57 6f 72
                             $stringWith_Weird_Name              XREF[1]:     entry:0021016c(*)
        002201d6 54 68 69        ds         "This string has a very strange label\n"
                 73 20 73
                 74 72 69
                 ...
"""


def _parse_ghidra_disassembly(disasm: str) -> dict:
    dd = {}
    # 00210158 20 00 80 d2     mov        x0,#0x1
    # ^^^^^^^^ ^^^^^^^^^^^     ^^^        ^^ ^^^^
    # address  bytes           mnemonic   o1,o2 (,o3)
    pattern = re.compile(
        r"^( ){8}(?P<address>[0-9a-f]+) "
        + r"(?P<bytes>([0-9a-f]{2}[ ]){4})\s+"
        + r"(?P<mnemonic>[\w\.]+)\s*"
        + r"(?P<operand1>[\w#$=>]+)?,?"
        + r"((?P<operand2>[\w#$=>]+))?,?"
        + r"((?P<operand3>[\w#$=>]+))?"
    )
    for line in disasm.splitlines()[20:]:
        m = pattern.match(line)
        if m:
            logger.debug("Match found\t%s\n\t\t\t\t%s", line, m.groupdict())
            dd[int(m["address"], 0x10)] = {
                "bytes": m["bytes"].strip(),
                "mnemonic": m["mnemonic"],
                "operands": [e for e in [m["operand1"], m["operand2"], m["operand3"]] if e is not None],
            }
        else:
            logger.debug("No match\t%s", line)
    return dd


BE2_EXTRACTOR = fixtures.get_binexport_extractor(
    CD
    / "data"
    / "binexport2"
    / "c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486.elf_.ghidra.BinExport"
)
PARSED_DISASM = _parse_ghidra_disassembly(GHIDRA_DISASSEMBLY)


def test_instruction_bytes():
    # more a data sanity check here as we don't test our code
    for addr, de in PARSED_DISASM.items():
        insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
        assert insn.raw_bytes == bytes.fromhex(de["bytes"])


def test_get_instruction_mnemonic():
    for addr, de in PARSED_DISASM.items():
        insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
        assert get_instruction_mnemonic(BE2_EXTRACTOR.be2, insn) == de["mnemonic"]


def test_get_instruction_operands_count():
    for addr, de in PARSED_DISASM.items():
        insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
        ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
        # this line is not properly parsed from the Ghidra disassembly using the current regex
        # 00210184 02 68 61 38     ldrb       w2,[x0, x1, LSL ]
        if addr == 0x210184:
            assert len(ops) == 2
        else:
            assert len(ops) == len(de["operands"])


@pytest.mark.parametrize(
    "addr,expressions",
    [
        # 00210158 20 00 80 d2     mov        x0,#0x1
        (
            0x210158,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"),
                BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1),
            ),
        ),
        # 0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0
        (
            0x21015C,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                BinExport2.Expression(
                    type=BinExport2.Expression.IMMEDIATE_INT, symbol="PTR_helloWorldStr_002101b0", immediate=0x2101B0
                ),
            ),
        ),
        # 00210184 02 68 61 38     ldrb       w2,[x0, x1, LSL ]
        #                                                 ^^^ issue in Ghidra?
        # IDA gives LDRB W2, [X0,X1]
        (
            0x210184,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="w2"),
                (
                    BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="["),
                    BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"),
                    BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","),
                    BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                    BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="]"),
                ),
            ),
        ),
        # 00210190 21 04 00 91     add        x1,x1,#0x1
        (
            0x210190,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1),
            ),
        ),
    ],
)
def test_get_operand_expressions(addr, expressions):
    insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
    ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
    for i, op in enumerate(ops):
        op_expression = expressions[i]
        exps = get_operand_expressions(BE2_EXTRACTOR.be2, op)
        if len(exps) > 1:
            for j, exp in enumerate(exps):
                assert exp.type == op_expression[j].type
                assert exp.symbol == op_expression[j].symbol
        else:
            assert len(exps) == 1
            assert exps[0] == op_expression


@pytest.mark.parametrize(
    "addr,expressions",
    [
        # 00210158 20 00 80 d2     mov        x0,#0x1
        (0x210158, ("x0", None)),
        # 0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0
        (0x21015C, ("x1", None)),
        # 0021019c e1 03 00 aa     mov        x1,x0
        (0x21019C, ("x1", "x0")),
        # 00210190 21 04 00 91     add        x1,x1,#0x1
        (0x210190, ("x1", "x1", None)),
    ],
)
def test_get_operand_register_expression(addr, expressions):
    insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
    ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
    for i, op in enumerate(ops):
        reg_exp = get_operand_register_expression(BE2_EXTRACTOR.be2, op)
        if reg_exp is None:
            assert reg_exp == expressions[i]
        else:
            assert reg_exp.symbol == expressions[i]


@pytest.mark.parametrize(
    "addr,expressions",
    [
        # 00210158 20 00 80 d2     mov        x0,#0x1
        (0x210158, (None, 0x1)),
        # 0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0
        (0x21015C, (None, 0x2101B0)),
        # 002101a8 01 00 00 d4     svc        0x0
        (0x2101A8, (0x0,)),
        # 00210190 21 04 00 91     add        x1,x1,#0x1
        (0x210190, (None, None, 0x1)),
    ],
)
def test_get_operand_immediate_expression(addr, expressions):
    insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
    ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
    for i, op in enumerate(ops):
        reg_exp = get_operand_immediate_expression(BE2_EXTRACTOR.be2, op)
        if reg_exp is None:
            assert reg_exp == expressions[i]
        else:
            assert reg_exp.immediate == expressions[i]


"""
mov x0, 0x20
bl 0x100
add x0, sp, 0x10
"""
BE2_DICT: dict[str, Any] = {
    "expression": [
        {"type": BinExport2.Expression.REGISTER, "symbol": "x0"},
        {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x20},
        {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x100},
        {"type": BinExport2.Expression.REGISTER, "symbol": "sp"},
        {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x10},
    ],
    # operand consists of 1 or more expressions, linked together as a tree
    "operand": [
        {"expression_index": [0]},
        {"expression_index": [1]},
        {"expression_index": [2]},
        {"expression_index": [3]},
        {"expression_index": [4]},
    ],
    "mnemonic": [
        {"name": "mov"},  # mnem 0
        {"name": "bl"},  # mnem 1
        {"name": "add"},  # mnem 2
    ],
    # instruction may have 0 or more operands
    "instruction": [
        {"mnemonic_index": 0, "operand_index": [0, 1]},
        {"mnemonic_index": 1, "operand_index": [2]},
        {"mnemonic_index": 2, "operand_index": [0, 3, 4]},
    ],
}
BE2 = ParseDict(
    BE2_DICT,
    BinExport2(),
)


def test_is_stack_register_expression():
    mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction())
    add = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction())

    mov_op0, mov_op1 = get_instruction_operands(BE2, mov)
    op0_exp0 = get_operand_expressions(BE2, mov_op0)[0]
    assert is_stack_register_expression(BE2, op0_exp0) is False
    op0_exp1 = get_operand_expressions(BE2, mov_op1)[0]
    assert is_stack_register_expression(BE2, op0_exp1) is False

    add_op0, add_op1, add_op2 = get_instruction_operands(BE2, add)
    op0_exp0 = get_operand_expressions(BE2, add_op0)[0]
    assert is_stack_register_expression(BE2, op0_exp0) is False
    op1_exp0 = get_operand_expressions(BE2, add_op1)[0]
    assert is_stack_register_expression(BE2, op1_exp0) is True
    op2_exp0 = get_operand_expressions(BE2, add_op2)[0]
    assert is_stack_register_expression(BE2, op2_exp0) is False


def test_split_with_delimiters():
    assert tuple(split_with_delimiters("abc|def", ("|",))) == ("abc", "|", "def")
    assert tuple(split_with_delimiters("abc|def|", ("|",))) == ("abc", "|", "def", "|")
    assert tuple(split_with_delimiters("abc||def", ("|",))) == ("abc", "|", "", "|", "def")
    assert tuple(split_with_delimiters("abc|def-ghi", ("|", "-"))) == ("abc", "|", "def", "-", "ghi")


def test_pattern_parsing():
    assert BinExport2InstructionPattern.from_str(
        "br reg ; capture reg"
    ) == BinExport2InstructionPattern(mnemonics=("br",), operands=("reg",), capture="reg")

    assert BinExport2InstructionPattern.from_str(
        "mov reg0, reg1 ; capture reg0"
    ) == BinExport2InstructionPattern(mnemonics=("mov",), operands=("reg0", "reg1"), capture="reg0")

    assert BinExport2InstructionPattern.from_str(
        "adrp reg, #int ; capture #int"
    ) == BinExport2InstructionPattern(mnemonics=("adrp",), operands=("reg", "#int"), capture="#int")

    assert BinExport2InstructionPattern.from_str(
        "add reg, reg, #int ; capture #int"
    ) == BinExport2InstructionPattern(mnemonics=("add",), operands=("reg", "reg", "#int"), capture="#int")

    assert BinExport2InstructionPattern.from_str(
        "ldr reg0, [reg1] ; capture reg1"
    ) == BinExport2InstructionPattern(mnemonics=("ldr",), operands=("reg0", ("[", "reg1")), capture="reg1")

    assert BinExport2InstructionPattern.from_str(
        "ldr|str reg, [reg, #int] ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldr",
            "str",
        ),
        operands=("reg", ("[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldr|str reg, [reg, #int]! ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldr",
            "str",
        ),
        operands=("reg", ("!", "[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldr|str reg, [reg], #int ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldr",
            "str",
        ),
        operands=(
            "reg",
            (
                "[",
                "reg",
            ),
            "#int",
        ),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldp|stp reg, reg, [reg, #int] ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldp",
            "stp",
        ),
        operands=("reg", "reg", ("[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldp|stp reg, reg, [reg, #int]! ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldp",
            "stp",
        ),
        operands=("reg", "reg", ("!", "[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldp|stp reg, reg, [reg], #int ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldp",
            "stp",
        ),
        operands=("reg", "reg", ("[", "reg"), "#int"),
        capture="#int",
    )

    assert (
        BinExport2InstructionPatternMatcher.from_str(
            """
            # comment
            br reg
            br reg(not-stack)
            br reg ; capture reg
            mov reg0, reg1 ; capture reg0
            adrp reg, #int ; capture #int
            add reg, reg, #int ; capture #int
            ldr reg0, [reg1] ; capture reg1
            ldr|str reg, [reg, #int] ; capture #int
            ldr|str reg, [reg, #int]! ; capture #int
            ldr|str reg, [reg], #int ; capture #int
            ldp|stp reg, reg, [reg, #int] ; capture #int
            ldp|stp reg, reg, [reg, #int]! ; capture #int
            ldp|stp reg, reg, [reg], #int ; capture #int
            ldrb reg0, [reg1, reg2] ; capture reg2
            call [reg + reg * #int + #int]
            call [reg + reg * #int]
            call [reg * #int + #int]
            call [reg + reg + #int]
            call [reg + #int]
            """
        ).queries
        is not None
    )


def match_address(extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int):
    instruction = extractor.idx.insn_by_address[address]
    mnemonic: str = get_instruction_mnemonic(extractor.be2, instruction)

    operands = []
    for operand_index in instruction.operand_index:
        operand = extractor.be2.operand[operand_index]
        operands.append(capa.features.extractors.binexport2.helpers.get_operand_expressions(extractor.be2, operand))

    return queries.match(mnemonic, operands)


def match_address_with_be2(
    extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int
):
    instruction_index = extractor.idx.insn_index_by_address[address]
    return queries.match_with_be2(extractor.be2, instruction_index)


def test_pattern_matching():
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        br reg(stack) ; capture reg
        br reg(not-stack) ; capture reg
        mov reg0, reg1 ; capture reg0
        adrp reg, #int ; capture #int
        add reg, reg, #int ; capture #int
        ldr reg0, [reg1] ; capture reg1
        ldr|str reg, [reg, #int] ; capture #int
        ldr|str reg, [reg, #int]! ; capture #int
        ldr|str reg, [reg], #int ; capture #int
        ldp|stp reg, reg, [reg, #int] ; capture #int
        ldp|stp reg, reg, [reg, #int]! ; capture #int
        ldp|stp reg, reg, [reg], #int ; capture #int
        ldrb reg0, [reg1(not-stack), reg2] ; capture reg2
        """
    )

    # 0x210184: ldrb w2, [x0, x1]
    # query: ldrb reg0, [reg1(not-stack), reg2] ; capture reg2
    assert match_address(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1"
    assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1"

    # 0x210198: mov x2, x1
    # query: mov reg0, reg1 ; capture reg0
    assert match_address(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2"
    assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2"

    # 0x210190: add x1, x1, 0x1
    # query: add reg, reg, #int ; capture #int
    assert match_address(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1
    assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1


BE2_EXTRACTOR_687 = fixtures.get_binexport_extractor(
    CD
    / "data"
    / "binexport2"
    / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport"
)


def test_pattern_matching_exclamation():
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        stp reg, reg, [reg, #int]! ; capture #int
        """
    )

    # note this captures the sp
    # 0x107918: stp x20, x19, [sp,0xFFFFFFFFFFFFFFE0]!
    # query: stp reg, reg, [reg, #int]! ; capture #int
    assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0
    assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0


def test_pattern_matching_stack():
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        stp reg, reg, [reg(stack), #int]! ; capture #int
        """
    )

    # note this does capture the sp
    # compare this with the test above (exclamation)
    # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]!
    # query: stp reg, reg, [reg(stack), #int]! ; capture #int
    assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0
    assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0


def test_pattern_matching_not_stack():
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        stp reg, reg, [reg(not-stack), #int]! ; capture #int
        """
    )

    # note this does not capture the sp
    # compare this with the test above (exclamation)
    # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]!
    # query: stp reg, reg, [reg(not-stack), #int]! ; capture #int
    assert match_address(BE2_EXTRACTOR_687, queries, 0x107918) is None
    assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918) is None


BE2_EXTRACTOR_MIMI = fixtures.get_binexport_extractor(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport")


def test_pattern_matching_x86():
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0
        """
    )

    # 0x4018c0: LEA ECX, [EBX+0x2]
    # query: cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0
    assert match_address(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2
    assert match_address_with_be2(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2
```