From 06d238a9f9bdb0846dd6955b48b20fa9a1f44fe6 Mon Sep 17 00:00:00 2001 From: Moritz Raabe Date: Fri, 10 Sep 2021 20:19:23 +0200 Subject: [PATCH] add ElfFeatureExtractor --- .github/mypy/mypy.ini | 3 + CHANGELOG.md | 1 + capa/features/common.py | 3 - capa/features/extractors/base_extractor.py | 2 +- capa/features/extractors/common.py | 21 ++- capa/features/extractors/elf.py | 7 + capa/features/extractors/elffile.py | 153 +++++++++++++++++++++ capa/features/extractors/pefile.py | 13 +- capa/features/extractors/viv/file.py | 9 +- capa/main.py | 16 ++- setup.py | 1 + 11 files changed, 204 insertions(+), 25 deletions(-) create mode 100644 capa/features/extractors/elffile.py diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini index 796fdc5f..6d177d40 100644 --- a/.github/mypy/mypy.ini +++ b/.github/mypy/mypy.ini @@ -71,3 +71,6 @@ ignore_missing_imports = True [mypy-devtools.*] ignore_missing_imports = True + +[mypy-elftools.*] +ignore_missing_imports = True diff --git a/CHANGELOG.md b/CHANGELOG.md index e1821d3f..8ea25726 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - rule format: add feature `os: ` for operating system, like `os: windows` #723 @williballenthin - rule format: add feature `substring: ` for verbatim strings with leading/trailing wildcards #737 @williballenthin - scripts: add `profile-memory.py` for profiling memory usage #736 @williballenthin +- main: add light weight ELF file feature extractor to detect file limitations #770 @mr-tz ### Breaking Changes diff --git a/capa/features/common.py b/capa/features/common.py index 1515275d..5a45cc86 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -344,7 +344,6 @@ VALID_ARCH = (ARCH_I386, ARCH_AMD64) class Arch(Feature): def __init__(self, value: str, description=None): - assert value in VALID_ARCH super(Arch, self).__init__(value, description=description) self.name = "arch" @@ -358,7 +357,6 @@ VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS}) class OS(Feature): def __init__(self, value: str, description=None): - assert value in (VALID_OS) super(OS, self).__init__(value, description=description) self.name = "os" @@ -370,7 +368,6 @@ VALID_FORMAT = (FORMAT_PE, FORMAT_ELF) class Format(Feature): def __init__(self, value: str, description=None): - assert value in (VALID_FORMAT) super(Format, self).__init__(value, description=description) self.name = "format" diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 67ba9451..28e2f63c 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -9,7 +9,7 @@ import abc from typing import Tuple, Iterator, SupportsInt -from capa.features.basicblock import Feature +from capa.features.common import Feature # feature extractors may reference functions, BBs, insns by opaque handle values. # the only requirement of these handles are that they support `__int__`, diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index 450fb636..99f0ea08 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -5,13 +5,25 @@ import contextlib import pefile +import capa.features import capa.features.extractors.elf import capa.features.extractors.pefile -from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format +from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format, String logger = logging.getLogger(__name__) +def extract_file_strings(buf, **kwargs): + """ + extract ASCII and UTF-16 LE strings from file + """ + for s in capa.features.extractors.strings.extract_ascii_strings(buf): + yield String(s.s), s.offset + + for s in capa.features.extractors.strings.extract_unicode_strings(buf): + yield String(s.s), s.offset + + def extract_format(buf): if buf.startswith(b"MZ"): yield Format(FORMAT_PE), 0x0 @@ -34,7 +46,7 @@ def extract_arch(buf): with contextlib.closing(io.BytesIO(buf)) as f: arch = capa.features.extractors.elf.detect_elf_arch(f) - if arch == "unknown": + if arch not in capa.features.common.VALID_ARCH: logger.debug("unsupported arch: %s", arch) return @@ -62,7 +74,12 @@ def extract_os(buf): with contextlib.closing(io.BytesIO(buf)) as f: os = capa.features.extractors.elf.detect_elf_os(f) + if os not in capa.features.common.VALID_OS: + logger.debug("unsupported os: %s", os) + return + yield OS(os), 0x0 + else: # we likely end up here: # 1. handling shellcode, or diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index ce3fe638..6e622b5c 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -1,3 +1,10 @@ +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. import struct import logging from enum import Enum diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py new file mode 100644 index 00000000..31330053 --- /dev/null +++ b/capa/features/extractors/elffile.py @@ -0,0 +1,153 @@ +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging + +from elftools.elf.elffile import ELFFile, SymbolTableSection + +import capa.features.extractors.common +from capa.features.file import Import, Section +from capa.features.common import OS, FORMAT_ELF, Arch, Format +from capa.features.extractors.elf import Arch as ElfArch +from capa.features.extractors.base_extractor import FeatureExtractor + +logger = logging.getLogger(__name__) + + +def extract_file_import_names(elf, **kwargs): + # see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372 + symbol_tables = [(idx, s) for idx, s in enumerate(elf.iter_sections()) if isinstance(s, SymbolTableSection)] + + for section_index, section in symbol_tables: + if not isinstance(section, SymbolTableSection): + continue + + if section["sh_entsize"] == 0: + logger.debug("Symbol table '%s' has a sh_entsize of zero!" % (section.name)) + continue + + logger.debug("Symbol table '%s' contains %s entries:" % (section.name, section.num_symbols())) + + for nsym, symbol in enumerate(section.iter_symbols()): + if symbol.name and symbol.entry.st_info.type == "STT_FUNC": + # TODO symbol address + # TODO symbol version info? + yield Import(symbol.name), 0x0 + + +def extract_file_section_names(elf, **kwargs): + for section in elf.iter_sections(): + if section.name: + yield Section(section.name), section.header.sh_addr + elif section.is_null(): + yield Section("NULL"), section.header.sh_addr + + +def extract_file_strings(buf, **kwargs): + yield from capa.features.extractors.common.extract_file_strings(buf) + + +def extract_file_os(elf, buf, **kwargs): + # our current approach does not always get an OS value, e.g. for packed samples + # for file limitation purposes, we're more lax here + try: + os = next(capa.features.extractors.common.extract_os(buf)) + yield os + except StopIteration: + yield OS("unknown"), 0x0 + + +def extract_file_format(**kwargs): + yield Format(FORMAT_ELF), 0x0 + + +def extract_file_arch(elf, **kwargs): + # TODO merge with capa.features.extractors.elf.detect_elf_arch() + arch = elf.get_machine_arch() + if arch == "x86": + yield Arch(ElfArch.I386), 0x0 + elif arch == "x64": + yield Arch(ElfArch.AMD64), 0x0 + else: + logger.warning("unsupported architecture: %s", arch) + + +def extract_file_features(elf, buf): + """ + extract file features from given sample + + args: + elf (elftools.elf.elffile.ELFFile): the parsed ELFFile + buf: the raw sample bytes + + yields: + Tuple[Feature, VA]: a feature and its location. + """ + + for file_handler in FILE_HANDLERS: + for feature, va in file_handler(elf=elf, buf=buf): + yield feature, va + + +FILE_HANDLERS = ( + # TODO extract file export names + # extract_file_export_names, + extract_file_import_names, + extract_file_section_names, + extract_file_strings, + # elffile doesn't have library matching + # extract_file_function_names, + extract_file_os, + extract_file_format, + extract_file_arch, +) + + +class ElfFeatureExtractor(FeatureExtractor): + def __init__(self, path: str): + super(ElfFeatureExtractor, self).__init__() + self.path = path + # TODO close where/when? + self.elf = ELFFile(open(self.path, "rb")) + + def get_base_address(self): + # virtual address of the first segment with type LOAD + for segment in self.elf.iter_segments(): + if segment.header.p_type == "PT_LOAD": + return segment.header.p_vaddr + + def extract_file_features(self): + with open(self.path, "rb") as f: + buf = f.read() + + for feature, va in extract_file_features(self.elf, buf): + yield feature, va + + def get_functions(self): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") + + def extract_function_features(self, f): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") + + def get_basic_blocks(self, f): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") + + def extract_basic_block_features(self, f, bb): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") + + def get_instructions(self, f, bb): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") + + def extract_insn_features(self, f, bb, insn): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") + + def is_library_function(self, va): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") + + def get_function_name(self, va): + raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index 3e5d97fd..91745f9f 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -5,16 +5,18 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. + import logging import pefile import capa.features.common import capa.features.extractors +import capa.features.extractors.common import capa.features.extractors.helpers import capa.features.extractors.strings from capa.features.file import Export, Import, Section -from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, String, Characteristic +from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic from capa.features.extractors.base_extractor import FeatureExtractor logger = logging.getLogger(__name__) @@ -85,14 +87,7 @@ def extract_file_section_names(pe, **kwargs): def extract_file_strings(buf, **kwargs): - """ - extract ASCII and UTF-16 LE strings from file - """ - for s in capa.features.extractors.strings.extract_ascii_strings(buf): - yield String(s.s), s.offset - - for s in capa.features.extractors.strings.extract_unicode_strings(buf): - yield String(s.s), s.offset + yield from capa.features.extractors.common.extract_file_strings(buf) def extract_file_function_names(**kwargs): diff --git a/capa/features/extractors/viv/file.py b/capa/features/extractors/viv/file.py index 4b9cd13f..b0e20032 100644 --- a/capa/features/extractors/viv/file.py +++ b/capa/features/extractors/viv/file.py @@ -68,14 +68,7 @@ def extract_file_section_names(vw, **kwargs): def extract_file_strings(buf, **kwargs): - """ - extract ASCII and UTF-16 LE strings from file - """ - for s in capa.features.extractors.strings.extract_ascii_strings(buf): - yield String(s.s), s.offset - - for s in capa.features.extractors.strings.extract_unicode_strings(buf): - yield String(s.s), s.offset + yield from capa.features.extractors.common.extract_file_strings(buf) def extract_file_function_names(vw, **kwargs): diff --git a/capa/main.py b/capa/main.py index 2b91fe26..7c804021 100644 --- a/capa/main.py +++ b/capa/main.py @@ -39,6 +39,7 @@ import capa.render.vverbose import capa.features.extractors import capa.features.extractors.common import capa.features.extractors.pefile +import capa.features.extractors.elffile from capa.rules import Rule, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import get_file_taste @@ -942,8 +943,8 @@ def main(argv=None): return -1 if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")): - # this pefile file feature extractor is pretty light weight: it doesn't do any code analysis. - # so we can fairly quickly determine if the given PE file has "pure" file-scope rules + # these pefile and elffile file feature extractors are pretty light weight: they don't do any code analysis. + # so we can fairly quickly determine if the given file has "pure" file-scope rules # that indicate a limitation (like "file is packed based on section names") # and avoid doing a full code analysis on difficult/impossible binaries. try: @@ -953,6 +954,17 @@ def main(argv=None): except PEFormatError as e: logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e)) return -1 + + elif args.format == "elf" or (args.format == "auto" and taste.startswith(b"\x7fELF")): + try: + from elftools.common.exceptions import ELFError + + file_extractor = capa.features.extractors.elffile.ElfFeatureExtractor(args.sample) + except ELFError as e: + logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e)) + return -1 + + if file_extractor: pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {}) # file limitations that rely on non-file scope won't be detected here. diff --git a/setup.py b/setup.py index 51a460f1..6d1108f8 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ requirements = [ "smda==1.6.2", "pefile==2021.9.3", "typing==3.7.4.3", + "pyelftools==0.27", ] # this sets __version__