From e90be5a9bb5ef9214cb0722b757f3cc5145e2491 Mon Sep 17 00:00:00 2001
From: Duncan Ogilvie
Date: Wed, 6 Dec 2023 15:29:10 +0100
Subject: [PATCH] Initial plumbing to support DEX files

---
 capa/features/common.py                       |  11 +-
 capa/features/extractors/common.py            |  12 ++
 capa/features/extractors/dexfile.py           | 101 ++++++++++++++++
 .../features/extractors/dexparser/__init__.py |   0
 .../extractors/dexparser/extractor.py         | 108 ++++++++++++++++++
 capa/main.py                                  |  11 ++
 pyproject.toml                                |   1 +
 7 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100644 capa/features/extractors/dexfile.py
 create mode 100644 capa/features/extractors/dexparser/__init__.py
 create mode 100644 capa/features/extractors/dexparser/extractor.py

diff --git a/capa/features/common.py b/capa/features/common.py
index 0cb1396d..c4ae25cc 100644
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -409,7 +409,9 @@ ARCH_I386 = "i386"
 ARCH_AMD64 = "amd64"
 # dotnet
 ARCH_ANY = "any"
-VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY)
+# dex
+ARCH_DALVIK = "dalvik"
+VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY, ARCH_DALVIK)
 
 
 class Arch(Feature):
@@ -421,10 +423,11 @@ class Arch(Feature):
 OS_WINDOWS = "windows"
 OS_LINUX = "linux"
 OS_MACOS = "macos"
+OS_ANDROID = "android"
 # dotnet
 OS_ANY = "any"
 VALID_OS = {os.value for os in capa.features.extractors.elf.OS}
-VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY})
+VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY, OS_ANDROID})
 # internal only, not to be used in rules
 OS_AUTO = "auto"
 
@@ -452,7 +455,8 @@ class OS(Feature):
 FORMAT_PE = "pe"
 FORMAT_ELF = "elf"
 FORMAT_DOTNET = "dotnet"
-VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
+FORMAT_DEX = "dex"
+VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_DEX)
 # internal only, not to be used in rules
 FORMAT_AUTO = "auto"
 FORMAT_SC32 = "sc32"
@@ -464,6 +468,7 @@ STATIC_FORMATS = {
     FORMAT_PE,
     FORMAT_ELF,
     FORMAT_DOTNET,
+    FORMAT_DEX,
 }
 DYNAMIC_FORMATS = {
     FORMAT_CAPE,
diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py
index b7bb3c39..ab198302 100644
--- a/capa/features/extractors/common.py
+++ b/capa/features/extractors/common.py
@@ -24,8 +24,11 @@ from capa.features.common import (
     OS_AUTO,
     ARCH_ANY,
     FORMAT_PE,
+    FORMAT_DEX,
     FORMAT_ELF,
+    OS_ANDROID,
     OS_WINDOWS,
+    ARCH_DALVIK,
     FORMAT_FREEZE,
     FORMAT_RESULT,
     Arch,
@@ -41,6 +44,7 @@ logger = logging.getLogger(__name__)
 # match strings for formats
 MATCH_PE = b"MZ"
 MATCH_ELF = b"\x7fELF"
+MATCH_DEX = b"dex\n"
 MATCH_RESULT = b'{"meta":'
 MATCH_JSON_OBJECT = b'{"'
 
@@ -61,6 +65,8 @@ def extract_format(buf) -> Iterator[Tuple[Feature, Address]]:
         yield Format(FORMAT_PE), NO_ADDRESS
     elif buf.startswith(MATCH_ELF):
         yield Format(FORMAT_ELF), NO_ADDRESS
+    elif len(buf) > 8 and buf.startswith(MATCH_DEX) and buf[7] == 0x00:
+        yield Format(FORMAT_DEX), NO_ADDRESS
     elif is_freeze(buf):
         yield Format(FORMAT_FREEZE), NO_ADDRESS
     elif buf.startswith(MATCH_RESULT):
@@ -96,6 +102,9 @@ def extract_arch(buf) -> Iterator[Tuple[Feature, Address]]:
 
         yield Arch(arch), NO_ADDRESS
 
+    elif len(buf) > 8 and buf.startswith(MATCH_DEX) and buf[7] == 0x00:
+        yield Arch(ARCH_DALVIK), NO_ADDRESS
+
     else:
         # we likely end up here:
         # 1. handling shellcode, or
@@ -129,6 +138,9 @@ def extract_os(buf, os=OS_AUTO) -> Iterator[Tuple[Feature, Address]]:
 
         yield OS(os), NO_ADDRESS
 
+    elif len(buf) > 8 and buf.startswith(MATCH_DEX) and buf[7] == 0x00:
+        yield OS(OS_ANDROID), NO_ADDRESS
+
     else:
         # we likely end up here:
         # 1. handling shellcode, or
diff --git a/capa/features/extractors/dexfile.py b/capa/features/extractors/dexfile.py
new file mode 100644
index 00000000..fe544e7d
--- /dev/null
+++ b/capa/features/extractors/dexfile.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import logging
+from typing import Tuple, Iterator
+from pathlib import Path
+
+from dexparser import DEXParser
+
+from capa.features.common import OS, FORMAT_DEX, OS_ANDROID, ARCH_DALVIK, Arch, Format, Feature
+from capa.features.address import NO_ADDRESS, Address
+from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor
+
+logger = logging.getLogger(__name__)
+
+# raised by every DexFileFeatureExtractor method below the file scope
+FILE_SCOPE_ONLY = "DexFileFeatureExtractor can only be used to extract file features"
+
+
+def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]:
+    """Yield the DEX format feature; keyword arguments are accepted but not used."""
+    yield Format(FORMAT_DEX), NO_ADDRESS
+
+
+FILE_HANDLERS = (extract_file_format,)
+
+
+def extract_file_features(dex: DEXParser) -> Iterator[Tuple[Feature, Address]]:
+    """Yield every feature produced by the handlers in FILE_HANDLERS for the parsed DEX file."""
+    for file_handler in FILE_HANDLERS:
+        for feature, addr in file_handler(dex=dex):  # type: ignore
+            yield feature, addr
+
+
+def extract_file_os(**kwargs) -> Iterator[Tuple[OS, Address]]:
+    """Yield the Android OS feature; keyword arguments are accepted but not used."""
+    yield OS(OS_ANDROID), NO_ADDRESS
+
+
+def extract_file_arch(**kwargs) -> Iterator[Tuple[Arch, Address]]:
+    """Yield the Dalvik architecture feature; keyword arguments are accepted but not used."""
+    yield Arch(ARCH_DALVIK), NO_ADDRESS
+
+
+GLOBAL_HANDLERS = (
+    extract_file_os,
+    extract_file_arch,
+)
+
+
+def extract_global_features(dex: DEXParser) -> Iterator[Tuple[Feature, Address]]:
+    """Yield every feature produced by the handlers in GLOBAL_HANDLERS for the parsed DEX file."""
+    for handler in GLOBAL_HANDLERS:
+        for feature, addr in handler(dex=dex):  # type: ignore
+            yield feature, addr
+
+
+class DexFileFeatureExtractor(StaticFeatureExtractor):
+    """File-scope extractor for DEX samples; function-level and finer methods raise NotImplementedError."""
+
+    def __init__(self, path: Path):
+        super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
+        self.path: Path = path
+        self.dex = DEXParser(filedir=str(path))
+
+    def get_base_address(self):
+        return NO_ADDRESS
+
+    def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
+        yield from extract_global_features(self.dex)
+
+    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
+        yield from extract_file_features(self.dex)
+
+    def get_functions(self):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
+
+    def extract_function_features(self, f):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
+
+    def get_basic_blocks(self, f):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
+
+    def extract_basic_block_features(self, f, bb):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
+
+    def get_instructions(self, f, bb):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
+
+    def extract_insn_features(self, f, bb, insn):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
+
+    def is_library_function(self, va):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
+
+    def get_function_name(self, va):
+        raise NotImplementedError(FILE_SCOPE_ONLY)
diff --git a/capa/features/extractors/dexparser/__init__.py b/capa/features/extractors/dexparser/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/capa/features/extractors/dexparser/extractor.py b/capa/features/extractors/dexparser/extractor.py
new file mode 100644
index 00000000..f8be97ef
--- /dev/null
+++ b/capa/features/extractors/dexparser/extractor.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+from __future__ import annotations
+
+import logging
+from typing import List, Tuple, Iterator
+from pathlib import Path
+
+import dexparser
+
+import capa.features.extractors
+import capa.features.extractors.dexfile
+from capa.features.common import Feature
+from capa.features.address import NO_ADDRESS, Address
+from capa.features.extractors.base_extractor import (
+    BBHandle,
+    InsnHandle,
+    SampleHashes,
+    FunctionHandle,
+    StaticFeatureExtractor,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DexparserFeatureExtractorCache:
+    """State shared across extraction; for now this is only the parsed DEX file."""
+
+    def __init__(self, dex: dexparser.DEXParser):
+        self.dex = dex
+
+
+class DexparserFeatureExtractor(StaticFeatureExtractor):
+    """
+    Feature extractor for DEX files backed by dexparser.
+
+    Only global features are produced so far; every other scope logs a TODO and yields nothing.
+    """
+
+    def __init__(self, path: Path):
+        self.dex = dexparser.DEXParser(filedir=str(path))
+        super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
+
+        self.cache = DexparserFeatureExtractorCache(self.dex)
+
+        # pre-compute these because we'll yield them at *every* scope.
+        self.global_features: List[Tuple[Feature, Address]] = []
+        self.global_features.extend(capa.features.extractors.dexfile.extract_file_format())
+        self.global_features.extend(capa.features.extractors.dexfile.extract_file_os(dex=self.dex))
+        self.global_features.extend(capa.features.extractors.dexfile.extract_file_arch(dex=self.dex))
+
+    def todo(self):
+        """Log, at debug level, the name of the calling method to mark it as not yet implemented."""
+        import inspect
+
+        # the caller's name is only needed for the log line, so skip the frame lookup when it won't be emitted
+        if not logger.isEnabledFor(logging.DEBUG):
+            return
+
+        # read the calling frame directly; inspect.stack() would build a record for every frame on the stack
+        frame = inspect.currentframe()
+        caller = frame.f_back.f_code.co_name if frame is not None and frame.f_back is not None else "unknown"
+        logger.debug("[DexparserFeatureExtractor:TODO] %s", caller)
+
+    def get_base_address(self):
+        return NO_ADDRESS
+
+    def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
+        yield from self.global_features
+
+    # the stubs below are generators because of the unreachable `yield`:
+    # iterating one logs the TODO and then produces no items.
+
+    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
+        return self.todo()
+        yield
+
+    def get_functions(self) -> Iterator[FunctionHandle]:
+        return self.todo()
+        yield
+
+    def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
+        return self.todo()
+        yield
+
+    def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]:
+        return self.todo()
+        yield
+
+    def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Feature, Address]]:
+        return self.todo()
+        yield
+
+    def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]:
+        return self.todo()
+        yield
+
+    def extract_insn_features(
+        self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle
+    ) -> Iterator[Tuple[Feature, Address]]:
+        return self.todo()
+        yield
diff --git a/capa/main.py b/capa/main.py
index 706c442a..de28e248 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -44,6 +44,7 @@ import capa.render.result_document
 import capa.render.result_document as rdoc
 import capa.features.extractors.common
 import capa.features.extractors.pefile
+import capa.features.extractors.dexfile
 import capa.features.extractors.elffile
 import capa.features.extractors.dotnetfile
 import capa.features.extractors.base_extractor
@@ -71,6 +72,7 @@ from capa.features.common import (
     OS_LINUX,
     OS_MACOS,
     FORMAT_PE,
+    FORMAT_DEX,
     FORMAT_ELF,
     OS_WINDOWS,
     FORMAT_AUTO,
@@ -306,6 +308,11 @@ def get_extractor(
 
         return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
 
+    elif format_ == FORMAT_DEX:
+        import capa.features.extractors.dexparser.extractor
+
+        return capa.features.extractors.dexparser.extractor.DexparserFeatureExtractor(path)
+
     elif backend == BACKEND_BINJA:
         from capa.features.extractors.binja.find_binja_api import find_binja_path
 
@@ -374,6 +381,9 @@ def get_file_extractors(sample: Path, format_: str) -> List[FeatureExtractor]:
     elif format_ == capa.features.common.FORMAT_ELF:
         file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample))
 
+    elif format_ == capa.features.common.FORMAT_DEX:
+        file_extractors.append(capa.features.extractors.dexfile.DexFileFeatureExtractor(sample))
+
     elif format_ == FORMAT_CAPE:
         report = json.load(Path(sample).open(encoding="utf-8"))
         file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report))
@@ -796,6 +806,7 @@ def install_common_args(parser, wanted=None):
             (FORMAT_PE, "Windows PE file"),
             (FORMAT_DOTNET, ".NET PE file"),
             (FORMAT_ELF, "Executable and Linkable Format"),
+            (FORMAT_DEX, "Android DEX file"),
             (FORMAT_SC32, "32-bit shellcode"),
             (FORMAT_SC64, "64-bit shellcode"),
             (FORMAT_CAPE, "CAPE sandbox report"),
diff --git a/pyproject.toml b/pyproject.toml
index 38f2e80c..23488d2a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ dependencies = [
     "dncil==1.0.2",
     "pydantic==2.4.0",
     "protobuf==4.23.4",
+    "dexparser==1.2.0",
 ]
 dynamic = ["version"]
 