# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the BinExport2 helper accessors and the instruction pattern matcher."""

import re
import logging
from typing import Any
from pathlib import Path

import pytest
import fixtures
from google.protobuf.json_format import ParseDict

import capa.features.extractors.binexport2.helpers
from capa.features.extractors.binexport2.helpers import (
    BinExport2InstructionPattern,
    BinExport2InstructionPatternMatcher,
    split_with_delimiters,
    get_operand_expressions,
    get_instruction_mnemonic,
    get_instruction_operands,
    get_operand_register_expression,
    get_operand_immediate_expression,
)
from capa.features.extractors.binexport2.extractor import BinExport2FeatureExtractor
from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression

logger = logging.getLogger(__name__)

CD = Path(__file__).resolve().parent


# found via https://www.virustotal.com/gui/search/type%253Aelf%2520and%2520size%253A1.2kb%252B%2520and%2520size%253A1.4kb-%2520and%2520tag%253Aarm%2520and%2520not%2520tag%253Arelocatable%2520and%2520tag%253A64bits/files
# Ghidra disassembly of c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486
#
# NOTE(review): the column layout below was reconstructed from a whitespace-collapsed
# copy of this file. `_parse_ghidra_disassembly` depends on two properties of it:
# instruction lines start with exactly 8 spaces, and the first instruction sits
# after the 20 lines that the parser skips. Verify against the original Ghidra export.
GHIDRA_DISASSEMBLY = """
                             //
                             // segment_1
                             // Loadable segment  [0x200000 - 0x200157]
                             // ram:00200000-ram:00200157
                             //
        00200000 7f 45 4c        Elf64_Ehdr
...
                             //
                             // .text
                             // SHT_PROGBITS  [0x210158 - 0x2101c7]
                             // ram:00210158-ram:002101c7
                             //
                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             undefined entry()
             undefined         w0:1
                             _start                                          XREF[4]:     Entry Point(*), 00200018(*),
                             entry                                                        002000c0(*),
                                                                                          _elfSectionHeaders::00000050(*)
        00210158 20 00 80 d2     mov        x0,#0x1
        0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0                   = "Hello World!\n"
                                                                                             = 00000000002201C8h
        00210160 c2 02 00 58     ldr        x2,DAT_002101b8                                  = 000000000000000Eh
        00210164 08 08 80 d2     mov        x8,#0x40
        00210168 01 00 00 d4     svc        0x0
        0021016c a0 02 00 58     ldr        x0=>$stringWith_Weird_Name,DAT_002101c0          = "This string has a very strang
                                                                                             = 00000000002201D6h
        00210170 04 00 00 94     bl         printString                                      undefined printString()
        00210174 60 0f 80 d2     mov        x0,#0x7b
        00210178 a8 0b 80 d2     mov        x8,#0x5d
        0021017c 01 00 00 d4     svc        0x0
                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             undefined printString()
             undefined         w0:1
                             printString                                     XREF[1]:     entry:00210170(c)
        00210180 01 00 80 d2     mov        x1,#0x0
                             strlenLoop                                      XREF[1]:     00210194(j)
        00210184 02 68 61 38     ldrb       w2,[x0, x1, LSL ]
        00210188 5f 00 00 71     cmp        w2,#0x0
        0021018c 60 00 00 54     b.eq       strlenDone
        00210190 21 04 00 91     add        x1,x1,#0x1
        00210194 fc ff ff 17     b          strlenLoop
                             strlenDone                                      XREF[1]:     0021018c(j)
        00210198 e2 03 01 aa     mov        x2,x1
        0021019c e1 03 00 aa     mov        x1,x0
        002101a0 20 00 80 d2     mov        x0,#0x1
        002101a4 08 08 80 d2     mov        x8,#0x40
        002101a8 01 00 00 d4     svc        0x0
        002101ac c0 03 5f d6     ret
                             DAT_002101b0                                    XREF[1]:     entry:0021015c(R)
        002101b0 c8 01 22        undefined8 00000000002201C8h                   ?  ->  002201c8
                 00 00 00
                 00 00
                             DAT_002101b8                                    XREF[1]:     entry:00210160(R)
        002101b8 0e 00 00        undefined8 000000000000000Eh
                 00 00 00
                 00 00
                             DAT_002101c0                                    XREF[1]:     entry:0021016c(R)
        002101c0 d6 01 22        undefined8 00000000002201D6h                   ?  ->  002201d6
                 00 00 00
                 00 00
                             //
                             // .data
                             // SHT_PROGBITS  [0x2201c8 - 0x2201fb]
                             // ram:002201c8-ram:002201fb
                             //
                             helloWorldStr                                   XREF[3]:     002000f8(*), entry:0021015c(*),
                                                                                          _elfSectionHeaders::00000090(*)
        002201c8 48 65 6c        ds         "Hello World!\n"
                 6c 6f 20
                 57 6f 72
                             $stringWith_Weird_Name                          XREF[1]:     entry:0021016c(*)
        002201d6 54 68 69        ds         "This string has a very strange label\n"
                 73 20 73
                 74 72 69
...
"""


def _parse_ghidra_disassembly(disasm: str) -> dict[int, dict[str, Any]]:
    """Parse instruction lines out of a Ghidra text listing.

    Args:
        disasm: the listing text; its first 20 lines are skipped.

    Returns:
        A dict keyed by instruction address. Each value holds "bytes" (the
        space-separated hex string), "mnemonic", and "operands" (the operand
        strings the regex managed to capture, at most three).
    """
    dd: dict[int, dict[str, Any]] = {}
    # 00210158 20 00 80 d2     mov        x0,#0x1
    # ^^^^^^^^ ^^^^^^^^^^^     ^^^        ^^ ^^^^
    # address  bytes           mnemonic   o1,o2  (,o3)
    #
    # The group names must match the keys read from the match object below.
    # Exactly four bytes are required, so data lines (three bytes per row)
    # do not match.
    pattern = re.compile(
        r"^( ){8}(?P<address>[0-9a-f]+) "
        + r"(?P<bytes>([0-9a-f]{2}[ ]){4})\s+"
        + r"(?P<mnemonic>[\w\.]+)\s*"
        + r"(?P<operand1>[\w#$=>]+)?,?"
        + r"((?P<operand2>[\w#$=>]+))?,?"
        + r"((?P<operand3>[\w#$=>]+))?"
    )
    for line in disasm.splitlines()[20:]:
        m = pattern.match(line)
        if m:
            logger.debug("Match found\t%s\n\t\t\t\t%s", line, m.groupdict())
            dd[int(m["address"], 0x10)] = {
                "bytes": m["bytes"].strip(),
                "mnemonic": m["mnemonic"],
                "operands": [e for e in [m["operand1"], m["operand2"], m["operand3"]] if e is not None],
            }
        else:
            logger.debug("No match\t%s", line)
    return dd


BE2_EXTRACTOR = fixtures.get_binexport_extractor(
    CD
    / "data"
    / "binexport2"
    / "c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486.elf_.ghidra.BinExport"
)
PARSED_DISASM = _parse_ghidra_disassembly(GHIDRA_DISASSEMBLY)


def test_instruction_bytes():
    """Check that the raw instruction bytes in the BinExport2 file match the listing."""
    # more a data sanity check here as we don't test our code
    for addr, de in PARSED_DISASM.items():
        insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
        assert insn.raw_bytes == bytes.fromhex(de["bytes"])


def test_get_instruction_mnemonic():
    """Check `get_instruction_mnemonic` against the mnemonics parsed from the listing."""
    for addr, de in PARSED_DISASM.items():
        insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
        assert get_instruction_mnemonic(BE2_EXTRACTOR.be2, insn) == de["mnemonic"]


def test_get_instruction_operands_count():
    """Check the operand count from `get_instruction_operands` against the listing."""
    for addr, de in PARSED_DISASM.items():
        insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
        ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
        # this line is not properly parsed from the Ghidra disassembly using the current regex
        # 00210184 02 68 61 38     ldrb       w2,[x0, x1, LSL ]
        if addr == 0x210184:
            assert len(ops) == 2
        else:
            assert len(ops) == len(de["operands"])


@pytest.mark.parametrize(
    "addr,expressions",
    [
        # 00210158 20 00 80 d2     mov        x0,#0x1
        (
            0x210158,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"),
                BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1),
            ),
        ),
        # 0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0
        (
            0x21015C,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                BinExport2.Expression(
                    type=BinExport2.Expression.IMMEDIATE_INT, symbol="PTR_helloWorldStr_002101b0", immediate=0x2101B0
                ),
            ),
        ),
        # 00210184 02 68 61 38     ldrb       w2,[x0, x1, LSL ]
        #                                                 ^^^ issue in Ghidra?
        # IDA gives                LDRB       W2, [X0,X1]
        (
            0x210184,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="w2"),
                (
                    BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="["),
                    BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"),
                    BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","),
                    BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                    BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="]"),
                ),
            ),
        ),
        # 00210190 21 04 00 91     add        x1,x1,#0x1
        (
            0x210190,
            (
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"),
                BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1),
            ),
        ),
    ],
)
def test_get_operand_expressions(addr, expressions):
    """Compare each operand's expressions with the expected ones.

    A multi-expression operand is compared element-wise on type and symbol only;
    a single-expression operand must equal the expected expression message.
    """
    insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
    ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
    for i, op in enumerate(ops):
        op_expression = expressions[i]
        exps = get_operand_expressions(BE2_EXTRACTOR.be2, op)
        if len(exps) > 1:
            for j, exp in enumerate(exps):
                assert exp.type == op_expression[j].type
                assert exp.symbol == op_expression[j].symbol
        else:
            assert len(exps) == 1
            assert exps[0] == op_expression


@pytest.mark.parametrize(
    "addr,expressions",
    [
        # 00210158 20 00 80 d2     mov        x0,#0x1
        (0x210158, ("x0", None)),
        # 0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0
        (0x21015C, ("x1", None)),
        # 0021019c e1 03 00 aa     mov        x1,x0
        (0x21019C, ("x1", "x0")),
        # 00210190 21 04 00 91     add        x1,x1,#0x1
        (0x210190, ("x1", "x1", None)),
    ],
)
def test_get_operand_register_expression(addr, expressions):
    """Check the register symbol (or None) reported for each operand."""
    insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
    ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
    for i, op in enumerate(ops):
        reg_exp = get_operand_register_expression(BE2_EXTRACTOR.be2, op)
        if reg_exp is None:
            assert expressions[i] is None
        else:
            assert reg_exp.symbol == expressions[i]


@pytest.mark.parametrize(
    "addr,expressions",
    [
        # 00210158 20 00 80 d2     mov        x0,#0x1
        (0x210158, (None, 0x1)),
        # 0021015c a1 02 00 58     ldr        x1=>helloWorldStr,DAT_002101b0
        (0x21015C, (None, 0x2101B0)),
        # 002101a8 01 00 00 d4     svc        0x0
        (0x2101A8, (0x0,)),
        # 00210190 21 04 00 91     add        x1,x1,#0x1
        (0x210190, (None, None, 0x1)),
    ],
)
def test_get_operand_immediate_expression(addr, expressions):
    """Check the immediate value (or None) reported for each operand."""
    insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr)
    ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn)
    for i, op in enumerate(ops):
        imm_exp = get_operand_immediate_expression(BE2_EXTRACTOR.be2, op)
        if imm_exp is None:
            assert expressions[i] is None
        else:
            assert imm_exp.immediate == expressions[i]


# Hand-built BinExport2 message describing these three instructions:
#
#   mov x0, 0x20
#   bl 0x100
#   add x0, sp, 0x10
BE2_DICT: dict[str, Any] = {
    "expression": [
        {"type": BinExport2.Expression.REGISTER, "symbol": "x0"},
        {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x20},
        {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x100},
        {"type": BinExport2.Expression.REGISTER, "symbol": "sp"},
        {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x10},
    ],
    # operand consists of 1 or more expressions, linked together as a tree
    "operand": [
        {"expression_index": [0]},
        {"expression_index": [1]},
        {"expression_index": [2]},
        {"expression_index": [3]},
        {"expression_index": [4]},
    ],
    "mnemonic": [
        {"name": "mov"},  # mnem 0
        {"name": "bl"},  # mnem 1
        {"name": "add"},  # mnem 2
    ],
    # instruction may have 0 or more operands
    "instruction": [
        {"mnemonic_index": 0, "operand_index": [0, 1]},
        {"mnemonic_index": 1, "operand_index": [2]},
        {"mnemonic_index": 2, "operand_index": [0, 3, 4]},
    ],
}
BE2 = ParseDict(
    BE2_DICT,
    BinExport2(),
)


def test_is_stack_register_expression():
    """Only the `sp` operand of the hand-built `add` is reported as a stack register."""
    mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction())
    add = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction())

    # mov x0, 0x20
    mov_op0, mov_op1 = get_instruction_operands(BE2, mov)
    mov_exp0 = get_operand_expressions(BE2, mov_op0)[0]
    assert is_stack_register_expression(BE2, mov_exp0) is False
    mov_exp1 = get_operand_expressions(BE2, mov_op1)[0]
    assert is_stack_register_expression(BE2, mov_exp1) is False

    # add x0, sp, 0x10
    add_op0, add_op1, add_op2 = get_instruction_operands(BE2, add)
    add_exp0 = get_operand_expressions(BE2, add_op0)[0]
    assert is_stack_register_expression(BE2, add_exp0) is False
    add_exp1 = get_operand_expressions(BE2, add_op1)[0]
    assert is_stack_register_expression(BE2, add_exp1) is True
    add_exp2 = get_operand_expressions(BE2, add_op2)[0]
    assert is_stack_register_expression(BE2, add_exp2) is False


def test_split_with_delimiters():
    """`split_with_delimiters` yields the delimiters as separate items between the pieces."""
    assert tuple(split_with_delimiters("abc|def", ("|",))) == ("abc", "|", "def")
    assert tuple(split_with_delimiters("abc|def|", ("|",))) == ("abc", "|", "def", "|")
    assert tuple(split_with_delimiters("abc||def", ("|",))) == ("abc", "|", "", "|", "def")
    assert tuple(split_with_delimiters("abc|def-ghi", ("|", "-"))) == ("abc", "|", "def", "-", "ghi")


def test_pattern_parsing():
    """Check that pattern strings parse into the expected `BinExport2InstructionPattern`."""
    assert BinExport2InstructionPattern.from_str(
        "br reg ; capture reg"
    ) == BinExport2InstructionPattern(mnemonics=("br",), operands=("reg",), capture="reg")

    assert BinExport2InstructionPattern.from_str(
        "mov reg0, reg1 ; capture reg0"
    ) == BinExport2InstructionPattern(mnemonics=("mov",), operands=("reg0", "reg1"), capture="reg0")

    assert BinExport2InstructionPattern.from_str(
        "adrp reg, #int ; capture #int"
    ) == BinExport2InstructionPattern(mnemonics=("adrp",), operands=("reg", "#int"), capture="#int")

    assert BinExport2InstructionPattern.from_str(
        "add reg, reg, #int ; capture #int"
    ) == BinExport2InstructionPattern(mnemonics=("add",), operands=("reg", "reg", "#int"), capture="#int")

    assert BinExport2InstructionPattern.from_str(
        "ldr reg0, [reg1] ; capture reg1"
    ) == BinExport2InstructionPattern(mnemonics=("ldr",), operands=("reg0", ("[", "reg1")), capture="reg1")

    assert BinExport2InstructionPattern.from_str(
        "ldr|str reg, [reg, #int] ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldr",
            "str",
        ),
        operands=("reg", ("[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldr|str reg, [reg, #int]! ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldr",
            "str",
        ),
        operands=("reg", ("!", "[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldr|str reg, [reg], #int ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldr",
            "str",
        ),
        operands=(
            "reg",
            (
                "[",
                "reg",
            ),
            "#int",
        ),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldp|stp reg, reg, [reg, #int] ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldp",
            "stp",
        ),
        operands=("reg", "reg", ("[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldp|stp reg, reg, [reg, #int]! ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldp",
            "stp",
        ),
        operands=("reg", "reg", ("!", "[", "reg", ",", "#int")),
        capture="#int",
    )

    assert BinExport2InstructionPattern.from_str(
        "ldp|stp reg, reg, [reg], #int ; capture #int"
    ) == BinExport2InstructionPattern(
        mnemonics=(
            "ldp",
            "stp",
        ),
        operands=("reg", "reg", ("[", "reg"), "#int"),
        capture="#int",
    )

    # One pattern per line: the leading "# comment" line must not share a line
    # with any pattern.
    assert (
        BinExport2InstructionPatternMatcher.from_str(
            """
            # comment
            br reg
            br reg(not-stack)
            br reg ; capture reg
            mov reg0, reg1 ; capture reg0
            adrp reg, #int ; capture #int
            add reg, reg, #int ; capture #int
            ldr reg0, [reg1] ; capture reg1
            ldr|str reg, [reg, #int] ; capture #int
            ldr|str reg, [reg, #int]! ; capture #int
            ldr|str reg, [reg], #int ; capture #int
            ldp|stp reg, reg, [reg, #int] ; capture #int
            ldp|stp reg, reg, [reg, #int]! ; capture #int
            ldp|stp reg, reg, [reg], #int ; capture #int
            ldrb reg0, [reg1, reg2] ; capture reg2
            call [reg + reg * #int + #int]
            call [reg + reg * #int]
            call [reg * #int + #int]
            call [reg + reg + #int]
            call [reg + #int]
            """
        ).queries
        is not None
    )


def match_address(extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int):
    """Match the instruction at `address` via `queries.match`.

    The mnemonic and the per-operand expression lists are collected here and
    passed to the matcher explicitly.
    """
    instruction = extractor.idx.insn_by_address[address]
    mnemonic: str = get_instruction_mnemonic(extractor.be2, instruction)

    operands = [
        capa.features.extractors.binexport2.helpers.get_operand_expressions(
            extractor.be2, extractor.be2.operand[operand_index]
        )
        for operand_index in instruction.operand_index
    ]

    return queries.match(mnemonic, operands)


def match_address_with_be2(
    extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int
):
    """Match the instruction at `address` via `queries.match_with_be2`, passing its instruction index."""
    instruction_index = extractor.idx.insn_index_by_address[address]
    return queries.match_with_be2(extractor.be2, instruction_index)


def test_pattern_matching():
    """Match three instructions of the hello-world sample and check what gets captured."""
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        br reg(stack) ; capture reg
        br reg(not-stack) ; capture reg
        mov reg0, reg1 ; capture reg0
        adrp reg, #int ; capture #int
        add reg, reg, #int ; capture #int
        ldr reg0, [reg1] ; capture reg1
        ldr|str reg, [reg, #int] ; capture #int
        ldr|str reg, [reg, #int]! ; capture #int
        ldr|str reg, [reg], #int ; capture #int
        ldp|stp reg, reg, [reg, #int] ; capture #int
        ldp|stp reg, reg, [reg, #int]! ; capture #int
        ldp|stp reg, reg, [reg], #int ; capture #int
        ldrb reg0, [reg1(not-stack), reg2] ; capture reg2
        """
    )

    # 0x210184: ldrb w2, [x0, x1]
    # query:    ldrb reg0, [reg1(not-stack), reg2] ; capture reg2
    assert match_address(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1"
    assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1"

    # 0x210198: mov x2, x1
    # query:    mov reg0, reg1 ; capture reg0
    assert match_address(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2"
    assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2"

    # 0x210190: add x1, x1, 0x1
    # query:    add reg, reg, #int ; capture #int
    assert match_address(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1
    assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1


BE2_EXTRACTOR_687 = fixtures.get_binexport_extractor(
    CD
    / "data"
    / "binexport2"
    / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport"
)


def test_pattern_matching_exclamation():
    """A `!` pattern with an unconstrained base register matches the stp at 0x107918."""
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        stp reg, reg, [reg, #int]! ; capture #int
        """
    )

    # note this captures the sp
    # 0x107918: stp x20, x19, [sp,0xFFFFFFFFFFFFFFE0]!
    # query:    stp reg, reg, [reg, #int]! ; capture #int
    assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0
    assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0


def test_pattern_matching_stack():
    """A `reg(stack)` base register matches the sp-relative stp at 0x107918."""
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        stp reg, reg, [reg(stack), #int]! ; capture #int
        """
    )

    # note this does capture the sp
    # compare this with the test above (exclamation)
    # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]!
    # query:    stp reg, reg, [reg(stack), #int]! ; capture #int
    assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0
    assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0


def test_pattern_matching_not_stack():
    """A `reg(not-stack)` base register does not match the sp-relative stp at 0x107918."""
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        stp reg, reg, [reg(not-stack), #int]! ; capture #int
        """
    )

    # note this does not capture the sp
    # compare this with the test above (exclamation)
    # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]!
    # query:    stp reg, reg, [reg(not-stack), #int]! ; capture #int
    assert match_address(BE2_EXTRACTOR_687, queries, 0x107918) is None
    assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918) is None


BE2_EXTRACTOR_MIMI = fixtures.get_binexport_extractor(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport")


def test_pattern_matching_x86():
    """The pattern matcher also handles an x86 `[reg + #int]` memory operand."""
    queries = BinExport2InstructionPatternMatcher.from_str(
        """
        cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0
        """
    )

    # 0x4018c0: LEA ECX, [EBX+0x2]
    # query:    cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0
    assert match_address(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2
    assert match_address_with_be2(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2