diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index f82364a2..2a40d263 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -8,17 +8,20 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Tuple +from typing import TYPE_CHECKING, List, Tuple + +from capa.features.address import Address, DNTokenAddress, DNTokenOffsetAddress, AbsoluteVirtualAddress if TYPE_CHECKING: from capa.features.common import Feature import dnfile +from dncil.clr.token import Token import capa.features.extractors import capa.features.extractors.dnfile.file import capa.features.extractors.dnfile.insn -from capa.features.extractors.base_extractor import FeatureExtractor +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor from capa.features.extractors.dnfile.helpers import get_dotnet_managed_method_bodies @@ -28,12 +31,12 @@ class DnfileFeatureExtractor(FeatureExtractor): self.pe: dnfile.dnPE = dnfile.dnPE(path) # pre-compute these because we'll yield them at *every* scope. - self.global_features: List[Tuple[Feature, int]] = [] + self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_os(pe=self.pe)) self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_arch(pe=self.pe)) def get_base_address(self): - return 0x0 + return AbsoluteVirtualAddress(0x0) def extract_global_features(self): yield from self.global_features @@ -42,14 +45,8 @@ class DnfileFeatureExtractor(FeatureExtractor): yield from capa.features.extractors.dnfile.file.extract_features(self.pe) def get_functions(self): - # data structure shared across functions yielded here. - # useful for caching analysis relevant across a single workspace. - ctx = {} - ctx["pe"] = self.pe - - for f in get_dotnet_managed_method_bodies(self.pe): - setattr(f, "ctx", ctx) - yield f + for token, f in get_dotnet_managed_method_bodies(self.pe): + yield FunctionHandle(address=DNTokenAddress(Token(token)), inner=f, ctx={"pe": self.pe}) def extract_function_features(self, f): # TODO @@ -57,14 +54,18 @@ class DnfileFeatureExtractor(FeatureExtractor): def get_basic_blocks(self, f): # each dotnet method is considered 1 basic block - yield f + yield BBHandle( + address=f.address, + inner=f.inner, + ) def extract_basic_block_features(self, f, bb): # we don't support basic block features yield from [] def get_instructions(self, f, bb): - yield from f.instructions + for insn in bb.inner.instructions: + yield InsnHandle(address=DNTokenOffsetAddress(bb.address.token, insn.offset), inner=insn) def extract_insn_features(self, f, bb, insn): yield from capa.features.extractors.dnfile.insn.extract_features(f, bb, insn) diff --git a/capa/features/extractors/dnfile/insn.py b/capa/features/extractors/dnfile/insn.py index 262b9779..54361524 100644 --- a/capa/features/extractors/dnfile/insn.py +++ b/capa/features/extractors/dnfile/insn.py @@ -11,10 +11,13 @@ from __future__ import annotations from typing import TYPE_CHECKING, Dict, Tuple, Iterator, Optional from itertools import chain +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle + if TYPE_CHECKING: from dncil.cil.instruction import Instruction from dncil.cil.body import CilMethodBody from capa.features.common import Feature + from capa.features.address import Address from dncil.clr.token import StringToken from dncil.cil.opcode import OpCodes @@ -38,8 +41,11 @@ def get_imports(ctx: Dict) -> Dict: return ctx["imports_cache"] -def extract_insn_api_features(f: CilMethodBody, bb: CilMethodBody, insn: Instruction) -> Iterator[Tuple[API, int]]: +def extract_insn_api_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """parse instruction API features""" + f: CilMethodBody = fh.inner + insn: Instruction = ih.inner + if insn.opcode not in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli): return @@ -49,26 +55,27 @@ def extract_insn_api_features(f: CilMethodBody, bb: CilMethodBody, insn: Instruc if "::" in name: # like System.IO.File::OpenRead - yield API(name), insn.offset + yield API(name), ih.address else: # like kernel32.CreateFileA dll, _, symbol = name.rpartition(".") for name_variant in capa.features.extractors.helpers.generate_symbols(dll, symbol): - yield API(name_variant), insn.offset + yield API(name_variant), ih.address -def extract_insn_number_features( - f: CilMethodBody, bb: CilMethodBody, insn: Instruction -) -> Iterator[Tuple[Number, int]]: +def extract_insn_number_features(fh, bh, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """parse instruction number features""" + insn: Instruction = ih.inner + if insn.is_ldc(): - yield Number(insn.get_ldc()), insn.offset + yield Number(insn.get_ldc()), ih.address -def extract_insn_string_features( - f: CilMethodBody, bb: CilMethodBody, insn: Instruction -) -> Iterator[Tuple[String, int]]: +def extract_insn_string_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """parse instruction string features""" + f: CilMethodBody = fh.inner + insn: Instruction = ih.inner + if not insn.is_ldstr(): return @@ -79,14 +86,14 @@ def extract_insn_string_features( if user_string is None: return - yield String(user_string), insn.offset + yield String(user_string), ih.address -def extract_features(f: CilMethodBody, bb: CilMethodBody, insn: Instruction) -> Iterator[Tuple[Feature, int]]: +def extract_features(f: FunctionHandle, bb: BBHandle, insn: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """extract instruction features""" for inst_handler in INSTRUCTION_HANDLERS: - for (feature, offset) in inst_handler(f, bb, insn): - yield feature, offset + for (feature, addr) in inst_handler(f, bb, insn): + yield feature, addr INSTRUCTION_HANDLERS = ( diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index a9a2c600..a541b965 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -4,51 +4,53 @@ from itertools import chain import dnfile import pefile +from dncil.clr.token import Token import capa.features.extractors.helpers from capa.features.file import Import from capa.features.common import OS, OS_ANY, ARCH_ANY, ARCH_I386, ARCH_AMD64, FORMAT_DOTNET, Arch, Format, Feature +from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress, AbsoluteVirtualAddress from capa.features.extractors.base_extractor import FeatureExtractor from capa.features.extractors.dnfile.helpers import get_dotnet_managed_imports, get_dotnet_unmanaged_imports logger = logging.getLogger(__name__) -def extract_file_format(**kwargs) -> Iterator[Tuple[Format, int]]: - yield Format(FORMAT_DOTNET), 0x0 +def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]: + yield Format(FORMAT_DOTNET), NO_ADDRESS -def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Import, int]]: +def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Import, Address]]: for (token, imp) in chain(get_dotnet_managed_imports(pe), get_dotnet_unmanaged_imports(pe)): if "::" in imp: # like System.IO.File::OpenRead - yield Import(imp), token + yield Import(imp), DNTokenAddress(Token(token)) else: # like kernel32.CreateFileA dll, _, symbol = imp.rpartition(".") for symbol_variant in capa.features.extractors.helpers.generate_symbols(dll, symbol): - yield Import(symbol_variant), token + yield Import(symbol_variant), DNTokenAddress(Token(token)) -def extract_file_os(**kwargs) -> Iterator[Tuple[OS, int]]: - yield OS(OS_ANY), 0x0 +def extract_file_os(**kwargs) -> Iterator[Tuple[OS, Address]]: + yield OS(OS_ANY), NO_ADDRESS -def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Arch, int]]: +def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Arch, Address]]: # to distinguish in more detail, see https://stackoverflow.com/a/23614024/10548020 # .NET 4.5 added option: any CPU, 32-bit preferred if pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE: - yield Arch(ARCH_I386), 0x0 + yield Arch(ARCH_I386), NO_ADDRESS elif not pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE_PLUS: - yield Arch(ARCH_AMD64), 0x0 + yield Arch(ARCH_AMD64), NO_ADDRESS else: - yield Arch(ARCH_ANY), 0x0 + yield Arch(ARCH_ANY), NO_ADDRESS -def extract_file_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]: +def extract_file_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]]: for file_handler in FILE_HANDLERS: - for feature, va in file_handler(pe=pe): # type: ignore - yield feature, va + for feature, addr in file_handler(pe=pe): # type: ignore + yield feature, addr FILE_HANDLERS = ( @@ -59,7 +61,7 @@ FILE_HANDLERS = ( ) -def extract_global_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]: +def extract_global_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]]: for handler in GLOBAL_HANDLERS: for feature, va in handler(pe=pe): # type: ignore yield feature, va @@ -77,8 +79,8 @@ class DotnetFileFeatureExtractor(FeatureExtractor): self.path: str = path self.pe: dnfile.dnPE = dnfile.dnPE(path) - def get_base_address(self) -> int: - return 0x0 + def get_base_address(self): + return AbsoluteVirtualAddress(0x0) def get_entry_point(self) -> int: # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT