diff --git a/capa/helpers.py b/capa/helpers.py
index dd0ff79e..eaee04bd 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -21,3 +21,13 @@ def get_file_taste(sample_path: str) -> bytes:
     with open(sample_path, "rb") as f:
         taste = f.read(8)
     return taste
+
+
+def is_runtime_ida() -> bool:
+    """Report whether the IDAPython `idc` module is importable, i.e. whether we appear to run within IDA."""
+    try:
+        import idc
+    except ImportError:
+        return False
+    else:
+        return True
diff --git a/capa/ida/helpers.py b/capa/ida/helpers.py
index b2e8f0fc..800d1383 100644
--- a/capa/ida/helpers.py
+++ b/capa/ida/helpers.py
@@ -12,6 +12,8 @@ import datetime
 import idc
 import idaapi
 import idautils
+import ida_bytes
+import ida_loader
 
 import capa
 import capa.version
@@ -121,3 +123,43 @@ def collect_metadata():
         },
         "version": capa.version.__version__,
     }
+
+
+class IDAIO:
+    """
+    An object that acts as a file-like object,
+    using bytes from the current IDB workspace.
+
+    Only absolute `seek`, `read`, and `close` are implemented.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # current position, expressed as an offset into the input file.
+        self.offset = 0
+
+    def seek(self, offset, whence=0):
+        """Move to the absolute file offset `offset` and return it; only whence=0 is supported."""
+        # raise explicitly rather than assert, since asserts are stripped under `python -O`.
+        if whence != 0:
+            raise NotImplementedError("IDAIO only supports absolute seeks (whence=0)")
+        self.offset = offset
+        return self.offset
+
+    def read(self, size):
+        """Read `size` bytes at the current offset and advance past what was read; b"" if unavailable."""
+        ea = ida_loader.get_fileregion_ea(self.offset)
+        if ea == idc.BADADDR:
+            # best guess, such as if file is mapped at address 0x0.
+            ea = self.offset
+
+        logger.debug("reading 0x%x bytes at 0x%x (ea: 0x%x)", size, self.offset, ea)
+        # get_bytes returns None on failure; always hand back bytes,
+        # like a real file, so callers can safely use len() and startswith().
+        buf = ida_bytes.get_bytes(ea, size) or b""
+        self.offset += len(buf)
+        return buf
+
+    def close(self):
+        """Nothing to release; present so the object works with contextlib.closing."""
+        return
diff --git a/scripts/detect-elf-os.py b/scripts/detect-elf-os.py
index f63dda4b..950d64df 100644
--- a/scripts/detect-elf-os.py
+++ b/scripts/detect-elf-os.py
@@ -13,278 +13,20 @@ detect-elf-os
 Attempt to detect the underlying OS that the given ELF file targets.
 """
 import sys
-import struct
 import logging
 import argparse
 import contextlib
-from enum import Enum
 from typing import BinaryIO
 
+import capa.helpers
+import capa.features.extractors.elf
+
 logger = logging.getLogger("capa.detect-elf-os")
 
 
-def align(v, alignment):
-    remainder = v % alignment
-    if remainder == 0:
-        return v
-    else:
-        return v + remainder
-
-
-class IDAIO:
-    """
-    An object that acts as a file-like object,
-    using bytes from the current IDB workspace.
-    """
-
-    def __init__(self):
-        assert IDAIO.is_runtime_ida() == True
-
-        super(IDAIO, self).__init__()
-        import idc
-        import ida_bytes
-        import ida_loader
-
-        self.offset = 0
-
-    def seek(self, offset, whence=0):
-        assert whence == 0
-        self.offset = offset
-
-    def read(self, size):
-        ea = ida_loader.get_fileregion_ea(self.offset)
-        if ea == idc.BADADDR:
-            # best guess, such as if file is mapped at address 0x0.
-            ea = self.offset
-
-        logger.debug("reading 0x%x bytes at 0x%x (ea: 0x%x)", size, self.offset, ea)
-        return ida_bytes.get_bytes(ea, size)
-
-    def close(self):
-        return
-
-    @staticmethod
-    def is_runtime_ida():
-        try:
-            import idc
-        except ImportError:
-            return False
-        else:
-            return True
-
-
-class CorruptElfFile(ValueError):
-    pass
-
-
-class OS(str, Enum):
-    HPUX = "HPUX"
-    NETBSD = "NETBSD"
-    LINUX = "LINUX"
-    HURD = "HURD"
-    _86OPEN = "86OPEN"
-    SOLARIS = "SOLARIS"
-    AIX = "AIX"
-    IRIX = "IRIX"
-    FREEBSD = "FREEBSD"
-    TRU64 = "TRU64"
-    MODESTO = "MODESTO"
-    OPENBSD = "OPENBSD"
-    OPENVMS = "OPENVMS"
-    NSK = "NSK"
-    AROS = "AROS"
-    FENIXOS = "FENIXOS"
-    CLOUD = "CLOUD"
-    SORTFIX = "SORTFIX"
-    ARM_AEABI = "ARM_AEABI"
-    SYLLABLE = "SYLLABLE"
-    NACL = "NACL"
-
-
-def detect_elf_os(f: BinaryIO) -> str:
-    f.seek(0x0)
-    file_header = f.read(0x40)
-
-    # we'll set this to the detected OS
-    # prefer the first heuristics,
-    # but rather than short circuiting,
-    # we'll still parse out the remainder, for debugging.
-    ret = None
-
-    if not file_header.startswith(b"\x7fELF"):
-        raise CorruptElfFile("missing magic header")
-
-    ei_class, ei_data = struct.unpack_from("BB", file_header, 4)
-    logger.debug("ei_class: 0x%02x ei_data: 0x%02x", ei_class, ei_data)
-    if ei_class == 1:
-        bitness = 32
-    elif ei_class == 2:
-        bitness = 64
-    else:
-        raise CorruptElfFile("invalid ei_class: 0x%02x" % ei_class)
-
-    if ei_data == 1:
-        endian = "<"
-    elif ei_data == 2:
-        endian = ">"
-    else:
-        raise CorruptElfFile("not an ELF file: invalid ei_data: 0x%02x" % ei_data)
-
-    if bitness == 32:
-        (e_phoff,) = struct.unpack_from(endian + "I", file_header, 0x1C)
-        e_phentsize, e_phnum = struct.unpack_from(endian + "HH", file_header, 0x2A)
-    elif bitness == 64:
-        (e_phoff,) = struct.unpack_from(endian + "Q", file_header, 0x20)
-        e_phentsize, e_phnum = struct.unpack_from(endian + "HH", file_header, 0x36)
-    else:
-        raise NotImplemented
-
-    logger.debug("e_phoff: 0x%02x e_phentsize: 0x%02x e_phnum: %d", e_phoff, e_phentsize, e_phnum)
-
-    (ei_osabi,) = struct.unpack_from(endian + "B", file_header, 7)
-    OSABI = {
-        # via pyelftools: https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/elftools/elf/enums.py#L35-L58
-        # 0: "SYSV",
-        1: OS.HPUX,
-        2: OS.NETBSD,
-        3: OS.LINUX,
-        4: OS.HURD,
-        5: OS._86OPEN,
-        6: OS.SOLARIS,
-        7: OS.AIX,
-        8: OS.IRIX,
-        9: OS.FREEBSD,
-        10: OS.TRU64,
-        11: OS.MODESTO,
-        12: OS.OPENBSD,
-        13: OS.OPENVMS,
-        14: OS.NSK,
-        15: OS.AROS,
-        16: OS.FENIXOS,
-        17: OS.CLOUD,
-        # 53: "SORTFIX",
-        # 64: "ARM_AEABI",
-        # 97: "ARM",
-        # 255: "STANDALONE",
-    }
-    logger.debug("ei_osabi: 0x%02x (%s)", ei_osabi, OSABI.get(ei_osabi, "unknown"))
-
-    if ei_osabi in OSABI and ei_osabi != 0x0:
-        # update only if not set
-        # so we can get the debugging output of subsequent strategies
-        ret = OSABI[ei_osabi] if not ret else ret
-
-    f.seek(e_phoff)
-    program_header_size = e_phnum * e_phentsize
-    program_headers = f.read(program_header_size)
-    if len(program_headers) != program_header_size:
-        logger.warning("failed to read program headers")
-        e_phnum = 0
-
-    for i in range(e_phnum):
-        offset = i * e_phentsize
-        phent = program_headers[offset : offset + e_phentsize]
-
-        PT_NOTE = 0x4
-
-        (p_type,) = struct.unpack_from(endian + "I", phent, 0x0)
-        logger.debug("p_type: 0x%04x", p_type)
-        if p_type != PT_NOTE:
-            continue
-
-        if bitness == 32:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "IIII", phent, 0x4)
-        elif bitness == 64:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "QQQQ", phent, 0x8)
-        else:
-            raise NotImplemented
-
-        logger.debug("p_offset: 0x%02x p_filesz: 0x%04x", p_offset, p_filesz)
-
-        f.seek(p_offset)
-        note = f.read(p_filesz)
-        if len(note) != p_filesz:
-            logger.warning("failed to read note content")
-            continue
-
-        namesz, descsz, type_ = struct.unpack_from(endian + "III", note, 0x0)
-        name_offset = 0xC
-        desc_offset = name_offset + align(namesz, 0x4)
-
-        logger.debug("namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, descsz, type_)
-
-        name = note[name_offset : name_offset + namesz].partition(b"\x00")[0].decode("ascii")
-        logger.debug("name: %s", name)
-
-        if type_ != 1:
-            continue
-
-        if name == "GNU":
-            if descsz < 16:
-                continue
-
-            desc = note[desc_offset : desc_offset + descsz]
-            abi_tag, kmajor, kminor, kpatch = struct.unpack_from(endian + "IIII", desc, 0x0)
-            # via readelf: https://github.com/bminor/binutils-gdb/blob/c0e94211e1ac05049a4ce7c192c9d14d1764eb3e/binutils/readelf.c#L19635-L19658
-            # and here: https://github.com/bminor/binutils-gdb/blob/34c54daa337da9fadf87d2706d6a590ae1f88f4d/include/elf/common.h#L933-L939
-            GNU_ABI_TAG = {
-                0: OS.LINUX,
-                1: OS.HURD,
-                2: OS.SOLARIS,
-                3: OS.FREEBSD,
-                4: OS.NETBSD,
-                5: OS.SYLLABLE,
-                6: OS.NACL,
-            }
-            logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag)
-
-            if abi_tag in GNU_ABI_TAG:
-                # update only if not set
-                # so we can get the debugging output of subsequent strategies
-                ret = GNU_ABI_TAG[abi_tag] if not ret else ret
-                logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", ret, kmajor, kminor, kpatch)
-        elif name == "OpenBSD":
-            logger.debug("note owner: %s", "OPENBSD")
-            ret = OS.OPENBSD if not ret else ret
-        elif name == "NetBSD":
-            logger.debug("note owner: %s", "NETBSD")
-            ret = OS.NETBSD if not ret else ret
-
-    for i in range(e_phnum):
-        offset = i * e_phentsize
-        phent = program_headers[offset : offset + e_phentsize]
-
-        PT_INTERP = 0x3
-
-        (p_type,) = struct.unpack_from(endian + "I", phent, 0x0)
-        if p_type != PT_INTERP:
-            continue
-
-        if bitness == 32:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "IIII", phent, 0x4)
-        elif bitness == 64:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "QQQQ", phent, 0x8)
-        else:
-            raise NotImplemented
-
-        f.seek(p_offset)
-        interp = f.read(p_filesz)
-        if len(interp) != p_filesz:
-            logger.warning("failed to read interp content")
-            continue
-
-        linker = interp.partition(b"\x00")[0].decode("ascii")
-        logger.debug("linker: %s", linker)
-        if "ld-linux" in linker:
-            # update only if not set
-            # so we can get the debugging output of subsequent strategies
-            ret = OS.LINUX if ret is None else ret
-
-    return ret.value if ret is not None else "unknown"
-
-
 def main(argv=None):
-    if IDAIO.is_runtime_ida():
+    if capa.helpers.is_runtime_ida():
+        from capa.ida.helpers import IDAIO
         f: BinaryIO = IDAIO()
 
     else:
@@ -318,15 +60,15 @@ def main(argv=None):
 
     with contextlib.closing(f):
         try:
-            print(detect_elf_os(f))
+            print(capa.features.extractors.elf.detect_elf_os(f))
             return 0
-        except CorruptElfFile as e:
+        except capa.features.extractors.elf.CorruptElfFile as e:
             logger.error("corrupt ELF file: %s", str(e.args[0]))
             return -1
 
 
 if __name__ == "__main__":
-    if IDAIO.is_runtime_ida():
+    if capa.helpers.is_runtime_ida():
         main()
     else:
         sys.exit(main())