From 06d238a9f9bdb0846dd6955b48b20fa9a1f44fe6 Mon Sep 17 00:00:00 2001
From: Moritz Raabe <moritz.raabe@fireeye.com>
Date: Fri, 10 Sep 2021 20:19:23 +0200
Subject: [PATCH] add ElfFeatureExtractor

---
 .github/mypy/mypy.ini                      |   3 +
 CHANGELOG.md                               |   1 +
 capa/features/common.py                    |   3 -
 capa/features/extractors/base_extractor.py |   2 +-
 capa/features/extractors/common.py         |  21 ++-
 capa/features/extractors/elf.py            |   7 +
 capa/features/extractors/elffile.py        | 153 +++++++++++++++++++++
 capa/features/extractors/pefile.py         |  13 +-
 capa/features/extractors/viv/file.py       |   9 +-
 capa/main.py                               |  16 ++-
 setup.py                                   |   1 +
 11 files changed, 204 insertions(+), 25 deletions(-)
 create mode 100644 capa/features/extractors/elffile.py

diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini
index 796fdc5f..6d177d40 100644
--- a/.github/mypy/mypy.ini
+++ b/.github/mypy/mypy.ini
@@ -71,3 +71,6 @@ ignore_missing_imports = True
 
 [mypy-devtools.*]
 ignore_missing_imports = True
+
+[mypy-elftools.*]
+ignore_missing_imports = True
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1821d3f..8ea25726 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - rule format: add feature `os: ` for operating system, like `os: windows` #723 @williballenthin
 - rule format: add feature `substring: ` for verbatim strings with leading/trailing wildcards #737 @williballenthin
 - scripts: add `profile-memory.py` for profiling memory usage #736 @williballenthin
+- main: add lightweight ELF file feature extractor to detect file limitations #770 @mr-tz
 
 ### Breaking Changes
 
diff --git a/capa/features/common.py b/capa/features/common.py
index 1515275d..5a45cc86 100644
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -344,7 +344,6 @@ VALID_ARCH = (ARCH_I386, ARCH_AMD64)
 
 class Arch(Feature):
     def __init__(self, value: str, description=None):
-        assert value in VALID_ARCH
         super(Arch, self).__init__(value, description=description)
         self.name = "arch"
 
@@ -358,7 +357,6 @@ VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS})
 
 class OS(Feature):
     def __init__(self, value: str, description=None):
-        assert value in (VALID_OS)
         super(OS, self).__init__(value, description=description)
         self.name = "os"
 
@@ -370,7 +368,6 @@ VALID_FORMAT = (FORMAT_PE, FORMAT_ELF)
 
 class Format(Feature):
     def __init__(self, value: str, description=None):
-        assert value in (VALID_FORMAT)
         super(Format, self).__init__(value, description=description)
         self.name = "format"
 
diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py
index 67ba9451..28e2f63c 100644
--- a/capa/features/extractors/base_extractor.py
+++ b/capa/features/extractors/base_extractor.py
@@ -9,7 +9,7 @@
 import abc
 from typing import Tuple, Iterator, SupportsInt
 
-from capa.features.basicblock import Feature
+from capa.features.common import Feature
 
 # feature extractors may reference functions, BBs, insns by opaque handle values.
 # the only requirement of these handles are that they support `__int__`,
diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py
index 450fb636..99f0ea08 100644
--- a/capa/features/extractors/common.py
+++ b/capa/features/extractors/common.py
@@ -5,13 +5,25 @@ import contextlib
 
 import pefile
 
+import capa.features
 import capa.features.extractors.elf
 import capa.features.extractors.pefile
-from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format
+from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format, String
 
 logger = logging.getLogger(__name__)
 
 
+def extract_file_strings(buf, **kwargs):
+    """
+    yield a String feature and its offset for each ASCII and UTF-16 LE string in buf
+    """
+    strings = capa.features.extractors.strings
+    # ASCII strings come first, then the UTF-16 LE ones
+    for extract in (strings.extract_ascii_strings, strings.extract_unicode_strings):
+        for found in extract(buf):
+            yield String(found.s), found.offset
+
+
 def extract_format(buf):
     if buf.startswith(b"MZ"):
         yield Format(FORMAT_PE), 0x0
@@ -34,7 +46,7 @@ def extract_arch(buf):
         with contextlib.closing(io.BytesIO(buf)) as f:
             arch = capa.features.extractors.elf.detect_elf_arch(f)
 
-        if arch == "unknown":
+        if arch not in capa.features.common.VALID_ARCH:
             logger.debug("unsupported arch: %s", arch)
             return
 
@@ -62,7 +74,12 @@ def extract_os(buf):
         with contextlib.closing(io.BytesIO(buf)) as f:
             os = capa.features.extractors.elf.detect_elf_os(f)
 
+        if os not in capa.features.common.VALID_OS:
+            logger.debug("unsupported os: %s", os)
+            return
+
         yield OS(os), 0x0
+
     else:
         # we likely end up here:
         #  1. handling shellcode, or
diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py
index ce3fe638..6e622b5c 100644
--- a/capa/features/extractors/elf.py
+++ b/capa/features/extractors/elf.py
@@ -1,3 +1,10 @@
+# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
 import struct
 import logging
 from enum import Enum
diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py
new file mode 100644
index 00000000..31330053
--- /dev/null
+++ b/capa/features/extractors/elffile.py
@@ -0,0 +1,153 @@
+# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import logging
+
+from elftools.elf.elffile import ELFFile, SymbolTableSection
+
+import capa.features.extractors.common
+from capa.features.file import Import, Section
+from capa.features.common import OS, FORMAT_ELF, Arch, Format
+from capa.features.extractors.elf import Arch as ElfArch
+from capa.features.extractors.base_extractor import FeatureExtractor
+
+logger = logging.getLogger(__name__)
+
+
+def extract_file_import_names(elf, **kwargs):
+    """
+    yield an Import feature for each named function symbol (STT_FUNC) in the symbol tables
+    """
+    # see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372
+    for section in elf.iter_sections():
+        if not isinstance(section, SymbolTableSection):
+            continue
+
+        if section["sh_entsize"] == 0:
+            logger.debug("Symbol table '%s' has a sh_entsize of zero!", section.name)
+            continue
+
+        logger.debug("Symbol table '%s' contains %s entries:", section.name, section.num_symbols())
+        for symbol in section.iter_symbols():
+            if symbol.name and symbol.entry.st_info.type == "STT_FUNC":
+                # TODO symbol address
+                # TODO symbol version info?
+                yield Import(symbol.name), 0x0
+
+
+def extract_file_section_names(elf, **kwargs):
+    # a section without a name is only reported, as "NULL", when it is the null section
+    for section in elf.iter_sections():
+        name = section.name or ("NULL" if section.is_null() else None)
+        if name:
+            yield Section(name), section.header.sh_addr
+
+
+def extract_file_strings(buf, **kwargs):  # delegates to the shared extractor; other keyword args are ignored
+    yield from capa.features.extractors.common.extract_file_strings(buf)
+
+
+def extract_file_os(elf, buf, **kwargs):
+    """
+    yield the OS feature found by the common extractor, falling back to OS("unknown")
+    """
+    # our current approach does not always get an OS value, e.g. for packed samples
+    # for file limitation purposes, we're more lax here
+    detected = next(capa.features.extractors.common.extract_os(buf), None)
+    yield detected if detected is not None else (OS("unknown"), 0x0)
+
+
+def extract_file_format(**kwargs):  # takes no input: the format reported here is always ELF
+    yield Format(FORMAT_ELF), 0x0
+
+
+def extract_file_arch(elf, **kwargs):
+    # TODO merge with capa.features.extractors.elf.detect_elf_arch()
+    # machine names reported by pyelftools, mapped to the arch values we emit
+    supported = {"x86": ElfArch.I386, "x64": ElfArch.AMD64}
+    machine = elf.get_machine_arch()
+    if machine in supported:
+        yield Arch(supported[machine]), 0x0
+    else:
+        logger.warning("unsupported architecture: %s", machine)
+
+
+def extract_file_features(elf, buf):
+    """
+    run every handler in FILE_HANDLERS against the sample and yield what they find
+
+    args:
+      elf (elftools.elf.elffile.ELFFile): the parsed ELFFile
+      buf: the raw sample bytes
+
+    yields:
+      Tuple[Feature, VA]: a feature and its location.
+    """
+
+    # both arguments are passed by keyword; handlers absorb the ones they don't use via **kwargs
+    for handler in FILE_HANDLERS:
+        yield from handler(elf=elf, buf=buf)
+
+
+FILE_HANDLERS = (
+    # handlers run in this order by extract_file_features(); each is called with elf= and buf=
+    # TODO extract file export names (extract_file_export_names)
+    extract_file_import_names,
+    extract_file_section_names,
+    extract_file_strings,
+    # elffile doesn't have library matching
+    # extract_file_function_names,
+    extract_file_os,
+    extract_file_format,
+    extract_file_arch,
+)
+
+
+class ElfFeatureExtractor(FeatureExtractor):
+    def __init__(self, path: str):
+        import io  # function-scope import: only needed to wrap the sample bytes below
+        super(ElfFeatureExtractor, self).__init__()
+        self.path = path
+        # parse an in-memory copy so the file handle is closed right away instead of leaking
+        with open(self.path, "rb") as f:
+            self.elf = ELFFile(io.BytesIO(f.read()))
+
+    def get_base_address(self):
+        # virtual address of the first segment with type LOAD; None when there is no such segment
+        for segment in self.elf.iter_segments():
+            if segment.header.p_type == "PT_LOAD":
+                return segment.header.p_vaddr
+
+    def extract_file_features(self):
+        with open(self.path, "rb") as f:
+            buf = f.read()
+        yield from extract_file_features(self.elf, buf)
+
+    def get_functions(self):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
+
+    def extract_function_features(self, f):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
+
+    def get_basic_blocks(self, f):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
+
+    def extract_basic_block_features(self, f, bb):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
+
+    def get_instructions(self, f, bb):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
+
+    def extract_insn_features(self, f, bb, insn):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
+
+    def is_library_function(self, va):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
+
+    def get_function_name(self, va):
+        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py
index 3e5d97fd..91745f9f 100644
--- a/capa/features/extractors/pefile.py
+++ b/capa/features/extractors/pefile.py
@@ -5,16 +5,18 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+
 import logging
 
 import pefile
 
 import capa.features.common
 import capa.features.extractors
+import capa.features.extractors.common
 import capa.features.extractors.helpers
 import capa.features.extractors.strings
 from capa.features.file import Export, Import, Section
-from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, String, Characteristic
+from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic
 from capa.features.extractors.base_extractor import FeatureExtractor
 
 logger = logging.getLogger(__name__)
@@ -85,14 +87,7 @@ def extract_file_section_names(pe, **kwargs):
 
 
 def extract_file_strings(buf, **kwargs):
-    """
-    extract ASCII and UTF-16 LE strings from file
-    """
-    for s in capa.features.extractors.strings.extract_ascii_strings(buf):
-        yield String(s.s), s.offset
-
-    for s in capa.features.extractors.strings.extract_unicode_strings(buf):
-        yield String(s.s), s.offset
+    yield from capa.features.extractors.common.extract_file_strings(buf)
 
 
 def extract_file_function_names(**kwargs):
diff --git a/capa/features/extractors/viv/file.py b/capa/features/extractors/viv/file.py
index 4b9cd13f..b0e20032 100644
--- a/capa/features/extractors/viv/file.py
+++ b/capa/features/extractors/viv/file.py
@@ -68,14 +68,7 @@ def extract_file_section_names(vw, **kwargs):
 
 
 def extract_file_strings(buf, **kwargs):
-    """
-    extract ASCII and UTF-16 LE strings from file
-    """
-    for s in capa.features.extractors.strings.extract_ascii_strings(buf):
-        yield String(s.s), s.offset
-
-    for s in capa.features.extractors.strings.extract_unicode_strings(buf):
-        yield String(s.s), s.offset
+    yield from capa.features.extractors.common.extract_file_strings(buf)
 
 
 def extract_file_function_names(vw, **kwargs):
diff --git a/capa/main.py b/capa/main.py
index 2b91fe26..7c804021 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -39,6 +39,7 @@ import capa.render.vverbose
 import capa.features.extractors
 import capa.features.extractors.common
 import capa.features.extractors.pefile
+import capa.features.extractors.elffile
 from capa.rules import Rule, RuleSet
 from capa.engine import FeatureSet, MatchResults
 from capa.helpers import get_file_taste
@@ -942,8 +943,8 @@ def main(argv=None):
         return -1
 
     if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")):
-        # this pefile file feature extractor is pretty light weight: it doesn't do any code analysis.
-        # so we can fairly quickly determine if the given PE file has "pure" file-scope rules
+        # these pefile and elffile file feature extractors are pretty lightweight: they don't do any code analysis.
+        # so we can fairly quickly determine if the given file matches "pure" file-scope rules
         # that indicate a limitation (like "file is packed based on section names")
         # and avoid doing a full code analysis on difficult/impossible binaries.
         try:
@@ -953,6 +954,17 @@ def main(argv=None):
         except PEFormatError as e:
             logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e))
             return -1
+
+    elif args.format == "elf" or (args.format == "auto" and taste.startswith(b"\x7fELF")):
+        try:
+            from elftools.common.exceptions import ELFError
+
+            file_extractor = capa.features.extractors.elffile.ElfFeatureExtractor(args.sample)
+        except ELFError as e:
+            logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e))
+            return -1
+
+    if file_extractor:
         pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {})
 
         # file limitations that rely on non-file scope won't be detected here.
diff --git a/setup.py b/setup.py
index 51a460f1..6d1108f8 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@ requirements = [
     "smda==1.6.2",
     "pefile==2021.9.3",
     "typing==3.7.4.3",
+    "pyelftools==0.27",
 ]
 
 # this sets __version__