mirror of
https://github.com/mandiant/capa.git
synced 2025-12-22 15:16:22 -08:00
Merge pull request #770 from fireeye/elffile-extractor
add light weight ElfFeatureExtractor
This commit is contained in:
3
.github/mypy/mypy.ini
vendored
3
.github/mypy/mypy.ini
vendored
@@ -71,3 +71,6 @@ ignore_missing_imports = True
|
|||||||
|
|
||||||
[mypy-devtools.*]
|
[mypy-devtools.*]
|
||||||
ignore_missing_imports = True
|
ignore_missing_imports = True
|
||||||
|
|
||||||
|
[mypy-elftools.*]
|
||||||
|
ignore_missing_imports = True
|
||||||
|
|||||||
@@ -10,6 +10,7 @@
|
|||||||
- rule format: add feature `os: ` for operating system, like `os: windows` #723 @williballenthin
|
- rule format: add feature `os: ` for operating system, like `os: windows` #723 @williballenthin
|
||||||
- rule format: add feature `substring: ` for verbatim strings with leading/trailing wildcards #737 @williballenthin
|
- rule format: add feature `substring: ` for verbatim strings with leading/trailing wildcards #737 @williballenthin
|
||||||
- scripts: add `profile-memory.py` for profiling memory usage #736 @williballenthin
|
- scripts: add `profile-memory.py` for profiling memory usage #736 @williballenthin
|
||||||
|
- main: add lightweight ELF file feature extractor to detect file limitations #770 @mr-tz
|
||||||
|
|
||||||
### Breaking Changes
|
### Breaking Changes
|
||||||
|
|
||||||
|
|||||||
@@ -344,7 +344,6 @@ VALID_ARCH = (ARCH_I386, ARCH_AMD64)
|
|||||||
|
|
||||||
class Arch(Feature):
|
class Arch(Feature):
|
||||||
def __init__(self, value: str, description=None):
|
def __init__(self, value: str, description=None):
|
||||||
assert value in VALID_ARCH
|
|
||||||
super(Arch, self).__init__(value, description=description)
|
super(Arch, self).__init__(value, description=description)
|
||||||
self.name = "arch"
|
self.name = "arch"
|
||||||
|
|
||||||
@@ -358,7 +357,6 @@ VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS})
|
|||||||
|
|
||||||
class OS(Feature):
|
class OS(Feature):
|
||||||
def __init__(self, value: str, description=None):
|
def __init__(self, value: str, description=None):
|
||||||
assert value in (VALID_OS)
|
|
||||||
super(OS, self).__init__(value, description=description)
|
super(OS, self).__init__(value, description=description)
|
||||||
self.name = "os"
|
self.name = "os"
|
||||||
|
|
||||||
@@ -370,7 +368,6 @@ VALID_FORMAT = (FORMAT_PE, FORMAT_ELF)
|
|||||||
|
|
||||||
class Format(Feature):
|
class Format(Feature):
|
||||||
def __init__(self, value: str, description=None):
|
def __init__(self, value: str, description=None):
|
||||||
assert value in (VALID_FORMAT)
|
|
||||||
super(Format, self).__init__(value, description=description)
|
super(Format, self).__init__(value, description=description)
|
||||||
self.name = "format"
|
self.name = "format"
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
import abc
|
import abc
|
||||||
from typing import Tuple, Iterator, SupportsInt
|
from typing import Tuple, Iterator, SupportsInt
|
||||||
|
|
||||||
from capa.features.basicblock import Feature
|
from capa.features.common import Feature
|
||||||
|
|
||||||
# feature extractors may reference functions, BBs, insns by opaque handle values.
|
# feature extractors may reference functions, BBs, insns by opaque handle values.
|
||||||
# the only requirement of these handles are that they support `__int__`,
|
# the only requirement of these handles are that they support `__int__`,
|
||||||
|
|||||||
@@ -5,13 +5,25 @@ import contextlib
|
|||||||
|
|
||||||
import pefile
|
import pefile
|
||||||
|
|
||||||
|
import capa.features
|
||||||
import capa.features.extractors.elf
|
import capa.features.extractors.elf
|
||||||
import capa.features.extractors.pefile
|
import capa.features.extractors.pefile
|
||||||
from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format
|
from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format, String
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_strings(buf, **kwargs):
    """
    extract ASCII and UTF-16 LE strings from file

    args:
      buf: the raw file contents to scan.
      **kwargs: ignored; accepted so the function can be used as a file handler.

    yields:
      (String, offset) pairs: all ASCII strings first, then all UTF-16 LE strings.
    """
    strings = capa.features.extractors.strings

    # ASCII extractor runs to completion before the unicode one starts,
    # so the ordering of yielded features matches the two passes.
    for extract in (strings.extract_ascii_strings, strings.extract_unicode_strings):
        for found in extract(buf):
            yield String(found.s), found.offset
|
||||||
|
|
||||||
|
|
||||||
def extract_format(buf):
|
def extract_format(buf):
|
||||||
if buf.startswith(b"MZ"):
|
if buf.startswith(b"MZ"):
|
||||||
yield Format(FORMAT_PE), 0x0
|
yield Format(FORMAT_PE), 0x0
|
||||||
@@ -34,7 +46,7 @@ def extract_arch(buf):
|
|||||||
with contextlib.closing(io.BytesIO(buf)) as f:
|
with contextlib.closing(io.BytesIO(buf)) as f:
|
||||||
arch = capa.features.extractors.elf.detect_elf_arch(f)
|
arch = capa.features.extractors.elf.detect_elf_arch(f)
|
||||||
|
|
||||||
if arch == "unknown":
|
if arch not in capa.features.common.VALID_ARCH:
|
||||||
logger.debug("unsupported arch: %s", arch)
|
logger.debug("unsupported arch: %s", arch)
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -62,7 +74,12 @@ def extract_os(buf):
|
|||||||
with contextlib.closing(io.BytesIO(buf)) as f:
|
with contextlib.closing(io.BytesIO(buf)) as f:
|
||||||
os = capa.features.extractors.elf.detect_elf_os(f)
|
os = capa.features.extractors.elf.detect_elf_os(f)
|
||||||
|
|
||||||
|
if os not in capa.features.common.VALID_OS:
|
||||||
|
logger.debug("unsupported os: %s", os)
|
||||||
|
return
|
||||||
|
|
||||||
yield OS(os), 0x0
|
yield OS(os), 0x0
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# we likely end up here:
|
# we likely end up here:
|
||||||
# 1. handling shellcode, or
|
# 1. handling shellcode, or
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||||
|
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||||
|
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and limitations under the License.
|
||||||
import struct
|
import struct
|
||||||
import logging
|
import logging
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|||||||
142
capa/features/extractors/elffile.py
Normal file
142
capa/features/extractors/elffile.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||||
|
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||||
|
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and limitations under the License.
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import contextlib
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from elftools.elf.elffile import ELFFile, SymbolTableSection
|
||||||
|
|
||||||
|
import capa.features.extractors.common
|
||||||
|
from capa.features.file import Import, Section
|
||||||
|
from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature
|
||||||
|
from capa.features.extractors.elf import Arch as ElfArch
|
||||||
|
from capa.features.extractors.base_extractor import FeatureExtractor
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_import_names(elf, **kwargs):
    """
    extract the names of function symbols listed in the ELF symbol tables.

    args:
      elf: parsed ELF file; must provide `iter_sections()`.
      **kwargs: ignored; accepted so all file handlers share one call signature.

    yields:
      (Import, 0x0) for every named symbol whose type is `STT_FUNC`.
      the address is always 0x0 because symbol addresses are not resolved yet.
    """
    # see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372
    # collect the symbol tables up front, before iterating over any symbols.
    symbol_tables = [s for s in elf.iter_sections() if isinstance(s, SymbolTableSection)]

    for section in symbol_tables:
        if section["sh_entsize"] == 0:
            # skip symbol tables that report a zero entry size
            logger.debug("Symbol table '%s' has a sh_entsize of zero!", section.name)
            continue

        logger.debug("Symbol table '%s' contains %s entries:", section.name, section.num_symbols())

        for symbol in section.iter_symbols():
            if symbol.name and symbol.entry.st_info.type == "STT_FUNC":
                # TODO symbol address
                # TODO symbol version info?
                yield Import(symbol.name), 0x0
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_section_names(elf, **kwargs):
    """
    extract the section names of the ELF file.

    args:
      elf: parsed ELF file; must provide `iter_sections()`.
      **kwargs: ignored; accepted so all file handlers share one call signature.

    yields:
      (Section, sh_addr) per section. sections without a name are reported
      as "NULL" when they are null sections and are skipped otherwise.
    """
    for sec in elf.iter_sections():
        label = sec.name
        if not label:
            if not sec.is_null():
                # unnamed, non-null section: nothing to report
                continue
            label = "NULL"

        yield Section(label), sec.header.sh_addr
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_strings(buf, **kwargs):
    """
    extract strings from the file by delegating to the shared extractor in
    `capa.features.extractors.common`.

    args:
      buf: the raw file contents.
      **kwargs: ignored; accepted so all file handlers share one call signature.
    """
    for feature, offset in capa.features.extractors.common.extract_file_strings(buf):
        yield feature, offset
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_os(elf, buf, **kwargs):
    """
    extract the operating system feature of the file.

    args:
      elf: unused here; accepted so all file handlers share one call signature.
      buf: the raw file contents.
      **kwargs: ignored.

    yields:
      the first (OS, address) pair produced by the common OS extractor,
      or (OS("unknown"), 0x0) when that extractor produces nothing.
    """
    # our current approach does not always get an OS value, e.g. for packed samples
    # for file limitation purposes, we're more lax here
    #
    # use the default form of next() so that only an exhausted extractor
    # triggers the fallback; the yield itself is not wrapped in a try block.
    detected = next(capa.features.extractors.common.extract_os(buf), None)

    if detected is None:
        yield OS("unknown"), 0x0
    else:
        yield detected
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_format(**kwargs):
    """
    report the file format, which is always ELF for this extractor.

    args:
      **kwargs: ignored; accepted so all file handlers share one call signature.

    yields:
      (Format(FORMAT_ELF), 0x0)
    """
    elf_format = Format(FORMAT_ELF)
    yield elf_format, 0x0
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_arch(elf, **kwargs):
    """
    extract the architecture feature of the ELF file.

    args:
      elf: parsed ELF file; must provide `get_machine_arch()`.
      **kwargs: ignored; accepted so all file handlers share one call signature.

    yields:
      (Arch, 0x0) for "x86" and "x64" machines. any other machine value is
      logged as a warning and yields nothing.
    """
    # TODO merge with capa.features.extractors.elf.detect_elf_arch()
    machine = elf.get_machine_arch()

    # machine names reported by the ELF parser -> arch values used for features
    supported = {
        "x86": ElfArch.I386,
        "x64": ElfArch.AMD64,
    }

    if machine not in supported:
        logger.warning("unsupported architecture: %s", machine)
        return

    yield Arch(supported[machine]), 0x0
|
||||||
|
|
||||||
|
|
||||||
|
def extract_file_features(elf: ELFFile, buf: bytes) -> Tuple[Feature, int]:
    """
    extract all file-scope features by running every handler in FILE_HANDLERS.

    args:
      elf: the parsed ELF file.
      buf: the raw file contents.

    yields:
      (feature, address) pairs, handler by handler, in FILE_HANDLERS order.
    """
    # NOTE(review): this is a generator; the return annotation describes each yielded item.
    for handler in FILE_HANDLERS:
        # every handler receives both keyword arguments and picks the ones it needs
        yield from handler(elf=elf, buf=buf)
|
||||||
|
|
||||||
|
|
||||||
|
# file-scope feature handlers, run in this order by `extract_file_features`.
# each one is called with the keyword arguments `elf` and `buf`.
FILE_HANDLERS = (
    # TODO extract_file_export_names,
    extract_file_import_names,
    extract_file_section_names,
    extract_file_strings,
    # no library matching
    extract_file_os,
    extract_file_format,
    extract_file_arch,
)
|
||||||
|
|
||||||
|
|
||||||
|
class ElfFeatureExtractor(FeatureExtractor):
    """
    feature extractor for ELF files that only supports file-scope features.

    the file is parsed once at construction time. every function, basic block
    and instruction level operation raises NotImplementedError.
    """

    # message raised by every operation other than file feature extraction
    _FILE_FEATURES_ONLY = "ElfFeatureExtractor can only be used to extract file features"

    def __init__(self, path: str):
        """
        args:
          path: path of the ELF file to parse.
        """
        super().__init__()
        self.path = path

        # parse from an in-memory copy so the file handle can be closed right away
        with open(self.path, "rb") as f:
            contents = f.read()
        self.elf = ELFFile(io.BytesIO(contents))

    def get_base_address(self):
        """
        return the virtual address of the first segment with type LOAD,
        or None when the file has no such segment.
        """
        load_segments = (seg for seg in self.elf.iter_segments() if seg.header.p_type == "PT_LOAD")
        first = next(load_segments, None)
        if first is None:
            return None
        return first.header.p_vaddr

    def extract_file_features(self):
        """
        yield (feature, address) pairs for all file-scope features.

        the file at `self.path` is read again on each call to obtain the raw bytes.
        """
        with open(self.path, "rb") as f:
            buf = f.read()

        # resolves to the module-level function of the same name
        yield from extract_file_features(self.elf, buf)

    def get_functions(self):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)

    def extract_function_features(self, f):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)

    def get_basic_blocks(self, f):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)

    def extract_basic_block_features(self, f, bb):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)

    def get_instructions(self, f, bb):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)

    def extract_insn_features(self, f, bb, insn):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)

    def is_library_function(self, va):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)

    def get_function_name(self, va):
        raise NotImplementedError(self._FILE_FEATURES_ONLY)
|
||||||
@@ -5,16 +5,18 @@
|
|||||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and limitations under the License.
|
# See the License for the specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import pefile
|
import pefile
|
||||||
|
|
||||||
import capa.features.common
|
import capa.features.common
|
||||||
import capa.features.extractors
|
import capa.features.extractors
|
||||||
|
import capa.features.extractors.common
|
||||||
import capa.features.extractors.helpers
|
import capa.features.extractors.helpers
|
||||||
import capa.features.extractors.strings
|
import capa.features.extractors.strings
|
||||||
from capa.features.file import Export, Import, Section
|
from capa.features.file import Export, Import, Section
|
||||||
from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, String, Characteristic
|
from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic
|
||||||
from capa.features.extractors.base_extractor import FeatureExtractor
|
from capa.features.extractors.base_extractor import FeatureExtractor
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -85,14 +87,7 @@ def extract_file_section_names(pe, **kwargs):
|
|||||||
|
|
||||||
|
|
||||||
def extract_file_strings(buf, **kwargs):
|
def extract_file_strings(buf, **kwargs):
|
||||||
"""
|
yield from capa.features.extractors.common.extract_file_strings(buf)
|
||||||
extract ASCII and UTF-16 LE strings from file
|
|
||||||
"""
|
|
||||||
for s in capa.features.extractors.strings.extract_ascii_strings(buf):
|
|
||||||
yield String(s.s), s.offset
|
|
||||||
|
|
||||||
for s in capa.features.extractors.strings.extract_unicode_strings(buf):
|
|
||||||
yield String(s.s), s.offset
|
|
||||||
|
|
||||||
|
|
||||||
def extract_file_function_names(**kwargs):
|
def extract_file_function_names(**kwargs):
|
||||||
|
|||||||
@@ -68,14 +68,7 @@ def extract_file_section_names(vw, **kwargs):
|
|||||||
|
|
||||||
|
|
||||||
def extract_file_strings(buf, **kwargs):
|
def extract_file_strings(buf, **kwargs):
|
||||||
"""
|
yield from capa.features.extractors.common.extract_file_strings(buf)
|
||||||
extract ASCII and UTF-16 LE strings from file
|
|
||||||
"""
|
|
||||||
for s in capa.features.extractors.strings.extract_ascii_strings(buf):
|
|
||||||
yield String(s.s), s.offset
|
|
||||||
|
|
||||||
for s in capa.features.extractors.strings.extract_unicode_strings(buf):
|
|
||||||
yield String(s.s), s.offset
|
|
||||||
|
|
||||||
|
|
||||||
def extract_file_function_names(vw, **kwargs):
|
def extract_file_function_names(vw, **kwargs):
|
||||||
|
|||||||
17
capa/main.py
17
capa/main.py
@@ -39,6 +39,7 @@ import capa.render.vverbose
|
|||||||
import capa.features.extractors
|
import capa.features.extractors
|
||||||
import capa.features.extractors.common
|
import capa.features.extractors.common
|
||||||
import capa.features.extractors.pefile
|
import capa.features.extractors.pefile
|
||||||
|
import capa.features.extractors.elffile
|
||||||
from capa.rules import Rule, RuleSet
|
from capa.rules import Rule, RuleSet
|
||||||
from capa.engine import FeatureSet, MatchResults
|
from capa.engine import FeatureSet, MatchResults
|
||||||
from capa.helpers import get_file_taste
|
from capa.helpers import get_file_taste
|
||||||
@@ -945,9 +946,10 @@ def main(argv=None):
|
|||||||
logger.error("%s", str(e))
|
logger.error("%s", str(e))
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
file_extractor = None
|
||||||
if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")):
|
if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")):
|
||||||
# this pefile file feature extractor is pretty light weight: it doesn't do any code analysis.
|
# these pefile and elffile file feature extractors are pretty light weight: they don't do any code analysis.
|
||||||
# so we can fairly quickly determine if the given PE file has "pure" file-scope rules
|
# so we can fairly quickly determine if the given file has "pure" file-scope rules
|
||||||
# that indicate a limitation (like "file is packed based on section names")
|
# that indicate a limitation (like "file is packed based on section names")
|
||||||
# and avoid doing a full code analysis on difficult/impossible binaries.
|
# and avoid doing a full code analysis on difficult/impossible binaries.
|
||||||
try:
|
try:
|
||||||
@@ -957,6 +959,17 @@ def main(argv=None):
|
|||||||
except PEFormatError as e:
|
except PEFormatError as e:
|
||||||
logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e))
|
logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e))
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
elif args.format == "elf" or (args.format == "auto" and taste.startswith(b"\x7fELF")):
|
||||||
|
try:
|
||||||
|
from elftools.common.exceptions import ELFError
|
||||||
|
|
||||||
|
file_extractor = capa.features.extractors.elffile.ElfFeatureExtractor(args.sample)
|
||||||
|
except ELFError as e:
|
||||||
|
logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e))
|
||||||
|
return -1
|
||||||
|
|
||||||
|
if file_extractor:
|
||||||
pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {})
|
pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {})
|
||||||
|
|
||||||
# file limitations that rely on non-file scope won't be detected here.
|
# file limitations that rely on non-file scope won't be detected here.
|
||||||
|
|||||||
@@ -260,6 +260,7 @@ def parse_feature(key: str):
|
|||||||
elif key == "format":
|
elif key == "format":
|
||||||
return capa.features.common.Format
|
return capa.features.common.Format
|
||||||
elif key == "arch":
|
elif key == "arch":
|
||||||
|
|
||||||
return capa.features.common.Arch
|
return capa.features.common.Arch
|
||||||
else:
|
else:
|
||||||
raise InvalidRule("unexpected statement: %s" % key)
|
raise InvalidRule("unexpected statement: %s" % key)
|
||||||
@@ -471,6 +472,12 @@ def build_statements(d, scope: str):
|
|||||||
raise InvalidRule("unexpected range: %s" % (count))
|
raise InvalidRule("unexpected range: %s" % (count))
|
||||||
elif key == "string" and not isinstance(d[key], str):
|
elif key == "string" and not isinstance(d[key], str):
|
||||||
raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key])
|
raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key])
|
||||||
|
elif (
|
||||||
|
(key == "os" and d[key] not in capa.features.common.VALID_OS)
|
||||||
|
or (key == "format" and d[key] not in capa.features.common.VALID_FORMAT)
|
||||||
|
or (key == "arch" and d[key] not in capa.features.common.VALID_ARCH)
|
||||||
|
):
|
||||||
|
raise InvalidRule("unexpected %s value %s" % (key, d[key]))
|
||||||
else:
|
else:
|
||||||
Feature = parse_feature(key)
|
Feature = parse_feature(key)
|
||||||
value, description = parse_description(d[key], key, d.get("description"))
|
value, description = parse_description(d[key], key, d.get("description"))
|
||||||
|
|||||||
Reference in New Issue
Block a user