mirror of https://github.com/mandiant/capa.git
synced 2025-12-22 07:10:29 -08:00
dotnet: basic detection and feature extraction (#987)
.github/mypy/mypy.ini | 3 (vendored)
@@ -74,3 +74,6 @@ ignore_missing_imports = True
 
 [mypy-elftools.*]
 ignore_missing_imports = True
+
+[mypy-dncil.*]
+ignore_missing_imports = True
.github/workflows/tests.yml | 4 (vendored)
@@ -48,7 +48,7 @@ jobs:
     - name: Checkout capa with submodules
       uses: actions/checkout@v2
       with:
-        submodules: true
+        submodules: recursive
     - name: Set up Python 3.8
       uses: actions/setup-python@v2
       with:
@@ -78,7 +78,7 @@ jobs:
     - name: Checkout capa with submodules
       uses: actions/checkout@v2
       with:
-        submodules: true
+        submodules: recursive
     - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
CHANGELOG.md
@@ -8,6 +8,7 @@
 - add new feature "operand[{0, 1, 2}].number" for matching instruction operand immediate values #767 @williballenthin
 - add new feature "operand[{0, 1, 2}].offset" for matching instruction operand offsets #767 @williballenthin
 - extract additional offset/number features in certain circumstances #320 @williballenthin
+- add detection and basic feature extraction for dotnet #987 @mr-tz, @mike-hunhoff, @williballenthin
 
 ### Breaking Changes
 
capa/exceptions.py | 14 (new file)
@@ -0,0 +1,14 @@
+class UnsupportedRuntimeError(RuntimeError):
+    pass
+
+
+class UnsupportedFormatError(ValueError):
+    pass
+
+
+class UnsupportedArchError(ValueError):
+    pass
+
+
+class UnsupportedOSError(ValueError):
+    pass
capa/features/common.py
@@ -390,7 +390,9 @@ class Bytes(Feature):
 # other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types
 ARCH_I386 = "i386"
 ARCH_AMD64 = "amd64"
-VALID_ARCH = (ARCH_I386, ARCH_AMD64)
+# dotnet
+ARCH_ANY = "any"
+VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY)
 
 
 class Arch(Feature):
@@ -402,8 +404,10 @@ class Arch(Feature):
 OS_WINDOWS = "windows"
 OS_LINUX = "linux"
 OS_MACOS = "macos"
+# dotnet
+OS_ANY = "any"
 VALID_OS = {os.value for os in capa.features.extractors.elf.OS}
-VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS})
+VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY})
 
 
 class OS(Feature):
@@ -414,7 +418,14 @@ class OS(Feature):
 
 FORMAT_PE = "pe"
 FORMAT_ELF = "elf"
-VALID_FORMAT = (FORMAT_PE, FORMAT_ELF)
+FORMAT_DOTNET = "dotnet"
+VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
+# internal only, not to be used in rules
+FORMAT_AUTO = "auto"
+FORMAT_SC32 = "sc32"
+FORMAT_SC64 = "sc64"
+FORMAT_FREEZE = "freeze"
+FORMAT_UNKNOWN = "unknown"
 
 
 class Format(Feature):
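A quick sanity check of the new constants, as a sketch (not part of the commit): FORMAT_DOTNET joins the values a rule may name, while the trailing constants stay internal-only.

    from capa.features.common import FORMAT_AUTO, FORMAT_DOTNET, VALID_FORMAT, Format

    assert FORMAT_DOTNET in VALID_FORMAT      # "dotnet" is now a rule-facing format
    assert FORMAT_AUTO not in VALID_FORMAT    # "auto" and friends remain internal
    feature = Format(FORMAT_DOTNET)           # the feature a .NET sample will carry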
capa/features/extractors/common.py
@@ -8,7 +8,8 @@ import pefile
 import capa.features
 import capa.features.extractors.elf
 import capa.features.extractors.pefile
-from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format, String
+from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, FORMAT_FREEZE, Arch, Format, String
+from capa.features.freeze import is_freeze
 
 logger = logging.getLogger(__name__)
 
@@ -29,6 +30,8 @@ def extract_format(buf):
         yield Format(FORMAT_PE), 0x0
     elif buf.startswith(b"\x7fELF"):
         yield Format(FORMAT_ELF), 0x0
+    elif is_freeze(buf):
+        yield Format(FORMAT_FREEZE), 0x0
     else:
         # we likely end up here:
         # 1. handling a file format (e.g. macho)
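A minimal usage sketch of the updated detection ("sample.bin" is a placeholder path; not part of the commit):

    import capa.features.extractors.common as common

    with open("sample.bin", "rb") as f:
        buf = f.read()

    # yields Format("pe") for an MZ header, Format("elf") for \x7fELF,
    # and now Format("freeze") for a previously frozen feature database.
    for feature, va in common.extract_format(buf):
        print(feature, hex(va))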
capa/features/extractors/dnfile/__init__.py | 0 (new empty file)
capa/features/extractors/dnfile/extractor.py | 70 (new file)
@@ -0,0 +1,70 @@
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, List, Tuple
+
+if TYPE_CHECKING:
+    from capa.features.common import Feature
+
+import dnfile
+
+import capa.features.extractors
+import capa.features.extractors.dnfile.file
+import capa.features.extractors.dnfile.insn
+from capa.features.extractors.base_extractor import FeatureExtractor
+from capa.features.extractors.dnfile.helpers import get_dotnet_managed_method_bodies
+
+
+class DnfileFeatureExtractor(FeatureExtractor):
+    def __init__(self, path: str):
+        super(DnfileFeatureExtractor, self).__init__()
+        self.pe: dnfile.dnPE = dnfile.dnPE(path)
+
+        # pre-compute these because we'll yield them at *every* scope.
+        self.global_features: List[Tuple[Feature, int]] = []
+        self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_os(pe=self.pe))
+        self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_arch(pe=self.pe))
+
+    def get_base_address(self):
+        return 0x0
+
+    def extract_global_features(self):
+        yield from self.global_features
+
+    def extract_file_features(self):
+        yield from capa.features.extractors.dnfile.file.extract_features(self.pe)
+
+    def get_functions(self):
+        # data structure shared across functions yielded here.
+        # useful for caching analysis relevant across a single workspace.
+        ctx = {}
+        ctx["pe"] = self.pe
+
+        for f in get_dotnet_managed_method_bodies(self.pe):
+            setattr(f, "ctx", ctx)
+            yield f
+
+    def extract_function_features(self, f):
+        # TODO
+        yield from []
+
+    def get_basic_blocks(self, f):
+        # each dotnet method is considered 1 basic block
+        yield f
+
+    def extract_basic_block_features(self, f, bb):
+        # we don't support basic block features
+        yield from []
+
+    def get_instructions(self, f, bb):
+        yield from f.instructions
+
+    def extract_insn_features(self, f, bb, insn):
+        yield from capa.features.extractors.dnfile.insn.extract_features(f, bb, insn)
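How the new extractor plugs into capa's extractor protocol, as a sketch ("hello.exe" is a placeholder managed PE; not part of the commit):

    from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor

    extractor = DnfileFeatureExtractor("hello.exe")

    for feature, va in extractor.extract_global_features():
        print("global:", feature)                      # OS(any), Arch(...)

    for f in extractor.get_functions():                # CilMethodBody objects
        for bb in extractor.get_basic_blocks(f):       # the method itself
            for insn in extractor.get_instructions(f, bb):
                for feature, offset in extractor.extract_insn_features(f, bb, insn):
                    print(hex(offset), feature)        # API / Number / String features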
capa/features/extractors/dnfile/file.py | 40 (new file)
@@ -0,0 +1,40 @@
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Tuple, Iterator
+
+if TYPE_CHECKING:
+    import dnfile
+    from capa.features.common import Feature, Format
+    from capa.features.file import Import
+
+import capa.features.extractors
+
+
+def extract_file_import_names(pe: dnfile.dnPE) -> Iterator[Tuple[Import, int]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_import_names(pe)
+
+
+def extract_file_format(pe: dnfile.dnPE) -> Iterator[Tuple[Format, int]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_format(pe=pe)
+
+
+def extract_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]:
+    for file_handler in FILE_HANDLERS:
+        for (feature, token) in file_handler(pe):
+            yield feature, token
+
+
+FILE_HANDLERS = (
+    extract_file_import_names,
+    # TODO extract_file_strings,
+    # TODO extract_file_function_names,
+    extract_file_format,
+)
capa/features/extractors/dnfile/helpers.py | 169 (new file)
@@ -0,0 +1,169 @@
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Tuple, Iterator, Optional
+
+import dnfile
+from dncil.cil.body import CilMethodBody
+from dncil.cil.error import MethodBodyFormatError
+from dncil.clr.token import Token, StringToken, InvalidToken
+from dncil.cil.body.reader import CilMethodBodyReaderBase
+
+logger = logging.getLogger(__name__)
+
+# key indexes to dotnet metadata tables
+DOTNET_META_TABLES_BY_INDEX = {table.value: table.name for table in dnfile.enums.MetadataTables}
+
+
+class DnfileMethodBodyReader(CilMethodBodyReaderBase):
+    def __init__(self, pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow):
+        self.pe: dnfile.dnPE = pe
+        self.offset: int = self.pe.get_offset_from_rva(row.Rva)
+
+    def read(self, n: int) -> bytes:
+        data: bytes = self.pe.get_data(self.pe.get_rva_from_offset(self.offset), n)
+        self.offset += n
+        return data
+
+    def tell(self) -> int:
+        return self.offset
+
+    def seek(self, offset: int) -> int:
+        self.offset = offset
+        return self.offset
+
+
+def calculate_dotnet_token_value(table: int, rid: int) -> int:
+    return ((table & 0xFF) << Token.TABLE_SHIFT) | (rid & Token.RID_MASK)
+
+
+def resolve_dotnet_token(pe: dnfile.dnPE, token: Token) -> Any:
+    """map generic token to string or table row"""
+    if isinstance(token, StringToken):
+        user_string: Optional[str] = read_dotnet_user_string(pe, token)
+        if user_string is None:
+            return InvalidToken(token.value)
+        return user_string
+
+    table_name: str = DOTNET_META_TABLES_BY_INDEX.get(token.table, "")
+    if not table_name:
+        # table_index is not valid
+        return InvalidToken(token.value)
+
+    table: Any = getattr(pe.net.mdtables, table_name, None)
+    if table is None:
+        # table index is valid but table is not present
+        return InvalidToken(token.value)
+
+    try:
+        return table.rows[token.rid - 1]
+    except IndexError:
+        # table index is valid but row index is not valid
+        return InvalidToken(token.value)
+
+
+def read_dotnet_method_body(pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow) -> Optional[CilMethodBody]:
+    """read dotnet method body"""
+    try:
+        return CilMethodBody(DnfileMethodBodyReader(pe, row))
+    except MethodBodyFormatError as e:
+        logger.warn("failed to parse managed method body @ 0x%08x (%s)" % (row.Rva, e))
+        return None
+
+
+def read_dotnet_user_string(pe: dnfile.dnPE, token: StringToken) -> Optional[str]:
+    """read user string from #US stream"""
+    try:
+        user_string: Optional[dnfile.stream.UserString] = pe.net.user_strings.get_us(token.rid)
+    except UnicodeDecodeError as e:
+        logger.warn("failed to decode #US stream index 0x%08x (%s)" % (token.rid, e))
+        return None
+    if user_string is None:
+        return None
+    return user_string.value
+
+
+def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[Tuple[int, str]]:
+    """get managed imports from MemberRef table
+
+    see https://www.ntcore.com/files/dotnetformat.htm
+
+    10 - MemberRef Table
+        Each row represents an imported method
+            Class (index into the TypeRef, ModuleRef, MethodDef, TypeSpec or TypeDef tables)
+            Name (index into String heap)
+    01 - TypeRef Table
+        Each row represents an imported class, its namespace and the assembly which contains it
+            TypeName (index into String heap)
+            TypeNamespace (index into String heap)
+    """
+    if not hasattr(pe.net.mdtables, "MemberRef"):
+        return
+
+    for (rid, row) in enumerate(pe.net.mdtables.MemberRef):
+        if not isinstance(row.Class.row, (dnfile.mdtable.TypeRefRow,)):
+            continue
+
+        token: int = calculate_dotnet_token_value(dnfile.enums.MetadataTables.MemberRef.value, rid + 1)
+        # like System.IO.File::OpenRead
+        imp: str = f"{row.Class.row.TypeNamespace}.{row.Class.row.TypeName}::{row.Name}"
+
+        yield token, imp
+
+
+def get_dotnet_unmanaged_imports(pe: dnfile.dnPE) -> Iterator[Tuple[int, str]]:
+    """get unmanaged imports from ImplMap table
+
+    see https://www.ntcore.com/files/dotnetformat.htm
+
+    28 - ImplMap Table
+        ImplMap table holds information about unmanaged methods that can be reached from managed code, using PInvoke dispatch
+            MemberForwarded (index into the Field or MethodDef table; more precisely, a MemberForwarded coded index)
+            ImportName (index into the String heap)
+            ImportScope (index into the ModuleRef table)
+    """
+    if not hasattr(pe.net.mdtables, "ImplMap"):
+        return
+
+    for row in pe.net.mdtables.ImplMap:
+        dll: str = row.ImportScope.row.Name
+        symbol: str = row.ImportName
+
+        # ECMA says "Each row of the ImplMap table associates a row in the MethodDef table (MemberForwarded) with the
+        # name of a routine (ImportName) in some unmanaged DLL (ImportScope)"; so we calculate and map the MemberForwarded
+        # MethodDef table token to help us later record native import method calls made from CIL
+        token: int = calculate_dotnet_token_value(row.MemberForwarded.table.number, row.MemberForwarded.row_index)
+
+        # like Kernel32.dll
+        if dll and "." in dll:
+            dll = dll.split(".")[0]
+
+        # like kernel32.CreateFileA
+        imp: str = f"{dll}.{symbol}"
+
+        yield token, imp
+
+
+def get_dotnet_managed_method_bodies(pe: dnfile.dnPE) -> Iterator[CilMethodBody]:
+    """get managed methods from MethodDef table"""
+    if not hasattr(pe.net.mdtables, "MethodDef"):
+        return
+
+    for row in pe.net.mdtables.MethodDef:
+        if not row.ImplFlags.miIL or any((row.Flags.mdAbstract, row.Flags.mdPinvokeImpl)):
+            # skip methods that do not have a method body
+            continue
+
+        body: Optional[CilMethodBody] = read_dotnet_method_body(pe, row)
+        if body is None:
+            continue
+
+        yield body
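The token helper packs the table index into the top byte and the 1-based row id into the low three bytes. A small check, assuming dncil's Token.TABLE_SHIFT is 24 and Token.RID_MASK is 0xFFFFFF (not part of the commit):

    import dnfile
    from capa.features.extractors.dnfile.helpers import calculate_dotnet_token_value

    # MemberRef is metadata table 0x0A; rid 1 addresses its first row.
    token = calculate_dotnet_token_value(dnfile.enums.MetadataTables.MemberRef.value, 1)
    assert token == 0x0A000001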
capa/features/extractors/dnfile/insn.py | 96 (new file)
@@ -0,0 +1,96 @@
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Dict, Tuple, Iterator, Optional
+from itertools import chain
+
+if TYPE_CHECKING:
+    from dncil.cil.instruction import Instruction
+    from dncil.cil.body import CilMethodBody
+    from capa.features.common import Feature
+
+from dncil.clr.token import StringToken
+from dncil.cil.opcode import OpCodes
+
+import capa.features.extractors.helpers
+from capa.features.insn import API, Number
+from capa.features.common import String
+from capa.features.extractors.dnfile.helpers import (
+    read_dotnet_user_string,
+    get_dotnet_managed_imports,
+    get_dotnet_unmanaged_imports,
+)
+
+
+def get_imports(ctx: Dict) -> Dict:
+    if "imports_cache" not in ctx:
+        ctx["imports_cache"] = {
+            token: imp
+            for (token, imp) in chain(get_dotnet_managed_imports(ctx["pe"]), get_dotnet_unmanaged_imports(ctx["pe"]))
+        }
+    return ctx["imports_cache"]
+
+
+def extract_insn_api_features(f: CilMethodBody, bb: CilMethodBody, insn: Instruction) -> Iterator[Tuple[API, int]]:
+    """parse instruction API features"""
+    if insn.opcode not in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli):
+        return
+
+    name: str = get_imports(f.ctx).get(insn.operand.value, "")
+    if not name:
+        return
+
+    if "::" in name:
+        # like System.IO.File::OpenRead
+        yield API(name), insn.offset
+    else:
+        # like kernel32.CreateFileA
+        dll, _, symbol = name.rpartition(".")
+        for name_variant in capa.features.extractors.helpers.generate_symbols(dll, symbol):
+            yield API(name_variant), insn.offset
+
+
+def extract_insn_number_features(
+    f: CilMethodBody, bb: CilMethodBody, insn: Instruction
+) -> Iterator[Tuple[Number, int]]:
+    """parse instruction number features"""
+    if insn.is_ldc():
+        yield Number(insn.get_ldc()), insn.offset
+
+
+def extract_insn_string_features(
+    f: CilMethodBody, bb: CilMethodBody, insn: Instruction
+) -> Iterator[Tuple[String, int]]:
+    """parse instruction string features"""
+    if not insn.is_ldstr():
+        return
+
+    if not isinstance(insn.operand, StringToken):
+        return
+
+    user_string: Optional[str] = read_dotnet_user_string(f.ctx["pe"], insn.operand)
+    if user_string is None:
+        return
+
+    yield String(user_string), insn.offset
+
+
+def extract_features(f: CilMethodBody, bb: CilMethodBody, insn: Instruction) -> Iterator[Tuple[Feature, int]]:
+    """extract instruction features"""
+    for inst_handler in INSTRUCTION_HANDLERS:
+        for (feature, offset) in inst_handler(f, bb, insn):
+            yield feature, offset
+
+
+INSTRUCTION_HANDLERS = (
+    extract_insn_api_features,
+    extract_insn_number_features,
+    extract_insn_string_features,
+)
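A sketch of the import cache behavior ("hello.exe" is a placeholder managed PE; not part of the commit): the MemberRef and ImplMap tables are walked once, then memoized on the shared ctx dict the extractor attaches to each method body.

    import dnfile
    from capa.features.extractors.dnfile.insn import get_imports

    pe = dnfile.dnPE("hello.exe")
    ctx = {"pe": pe}

    imports = get_imports(ctx)          # {token: "System.IO.File::OpenRead", ...}
    assert get_imports(ctx) is imports  # second call hits the cache stored in ctx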
capa/features/extractors/dotnetfile.py | 129 (new file)
@@ -0,0 +1,129 @@
+import logging
+from typing import Tuple, Iterator
+from itertools import chain
+
+import dnfile
+import pefile
+
+import capa.features.extractors.helpers
+from capa.features.file import Import
+from capa.features.common import OS, OS_ANY, ARCH_ANY, ARCH_I386, ARCH_AMD64, FORMAT_DOTNET, Arch, Format, Feature
+from capa.features.extractors.base_extractor import FeatureExtractor
+from capa.features.extractors.dnfile.helpers import get_dotnet_managed_imports, get_dotnet_unmanaged_imports
+
+logger = logging.getLogger(__name__)
+
+
+def extract_file_format(**kwargs) -> Iterator[Tuple[Format, int]]:
+    yield Format(FORMAT_DOTNET), 0x0
+
+
+def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Import, int]]:
+    for (token, imp) in chain(get_dotnet_managed_imports(pe), get_dotnet_unmanaged_imports(pe)):
+        if "::" in imp:
+            # like System.IO.File::OpenRead
+            yield Import(imp), token
+        else:
+            # like kernel32.CreateFileA
+            dll, _, symbol = imp.rpartition(".")
+            for symbol_variant in capa.features.extractors.helpers.generate_symbols(dll, symbol):
+                yield Import(symbol_variant), token
+
+
+def extract_file_os(**kwargs) -> Iterator[Tuple[OS, int]]:
+    yield OS(OS_ANY), 0x0
+
+
+def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Arch, int]]:
+    # to distinguish in more detail, see https://stackoverflow.com/a/23614024/10548020
+    # .NET 4.5 added option: any CPU, 32-bit preferred
+    if pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE:
+        yield Arch(ARCH_I386), 0x0
+    elif not pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE_PLUS:
+        yield Arch(ARCH_AMD64), 0x0
+    else:
+        yield Arch(ARCH_ANY), 0x0
+
+
+def extract_file_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]:
+    for file_handler in FILE_HANDLERS:
+        for feature, va in file_handler(pe=pe):  # type: ignore
+            yield feature, va
+
+
+FILE_HANDLERS = (
+    extract_file_import_names,
+    # TODO extract_file_strings,
+    # TODO extract_file_function_names,
+    extract_file_format,
+)
+
+
+def extract_global_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]:
+    for handler in GLOBAL_HANDLERS:
+        for feature, va in handler(pe=pe):  # type: ignore
+            yield feature, va
+
+
+GLOBAL_HANDLERS = (
+    extract_file_os,
+    extract_file_arch,
+)
+
+
+class DotnetFileFeatureExtractor(FeatureExtractor):
+    def __init__(self, path: str):
+        super(DotnetFileFeatureExtractor, self).__init__()
+        self.path: str = path
+        self.pe: dnfile.dnPE = dnfile.dnPE(path)
+
+    def get_base_address(self) -> int:
+        return 0x0
+
+    def get_entry_point(self) -> int:
+        # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT
+        #  True: native EP: Token
+        #  False: managed EP: RVA
+        return self.pe.net.struct.EntryPointTokenOrRva
+
+    def extract_global_features(self):
+        yield from extract_global_features(self.pe)
+
+    def extract_file_features(self):
+        yield from extract_file_features(self.pe)
+
+    def is_dotnet_file(self) -> bool:
+        return bool(self.pe.net)
+
+    def is_mixed_mode(self) -> bool:
+        return not bool(self.pe.net.Flags.CLR_ILONLY)
+
+    def get_runtime_version(self) -> Tuple[int, int]:
+        return self.pe.net.struct.MajorRuntimeVersion, self.pe.net.struct.MinorRuntimeVersion
+
+    def get_meta_version_string(self) -> str:
+        return self.pe.net.metadata.struct.Version.rstrip(b"\x00").decode("utf-8")
+
+    def get_functions(self):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
+
+    def extract_function_features(self, f):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
+
+    def get_basic_blocks(self, f):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
+
+    def extract_basic_block_features(self, f, bb):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
+
+    def get_instructions(self, f, bb):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
+
+    def extract_insn_features(self, f, bb, insn):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
+
+    def is_library_function(self, va):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
+
+    def get_function_name(self, va):
+        raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features")
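A file-scope usage sketch ("hello.exe" is a placeholder path; not part of the commit):

    from capa.features.extractors.dotnetfile import DotnetFileFeatureExtractor

    extractor = DotnetFileFeatureExtractor("hello.exe")
    if extractor.is_dotnet_file():
        print("runtime:", extractor.get_runtime_version())   # e.g. (2, 5)
        for feature, token in extractor.extract_file_features():
            print(feature, hex(token))                        # imports plus Format(dotnet)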
capa/features/extractors/helpers.py
@@ -51,6 +51,9 @@ def generate_symbols(dll: str, symbol: str) -> Iterator[str]:
     - CreateFileA
     - CreateFile
     """
+    # normalize dll name
+    dll = dll.lower()
+
     # kernel32.CreateFileA
     yield "%s.%s" % (dll, symbol)
 
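The effect of the normalization, roughly (a sketch; not part of the commit):

    import capa.features.extractors.helpers as helpers

    # the dll portion is lowercased before the variants are generated,
    # so mixed-case P/Invoke module names still match existing rules.
    list(helpers.generate_symbols("Kernel32", "CreateFileA"))
    # -> ["kernel32.CreateFileA", "CreateFileA", "CreateFile"]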
capa/features/freeze.py
@@ -53,12 +53,12 @@ import zlib
 import logging
 from typing import Dict, Type
 
+import capa.helpers
 import capa.features.file
 import capa.features.insn
 import capa.features.common
 import capa.features.basicblock
 import capa.features.extractors.base_extractor
-from capa.helpers import hex
 from capa.features.common import Feature
 
 logger = logging.getLogger(__name__)
@@ -87,6 +87,7 @@ def dumps(extractor):
     returns:
       str: the serialized features.
     """
+    hex = capa.helpers.hex
     ret = {
         "version": 1,
         "base address": extractor.get_base_address(),
capa/features/insn.py
@@ -13,11 +13,6 @@ from capa.features.common import Feature
 
 class API(Feature):
     def __init__(self, name: str, description=None):
-        # Downcase library name if given
-        if "." in name:
-            modname, _, impname = name.rpartition(".")
-            name = modname.lower() + "." + impname
-
         super(API, self).__init__(name, description=description)
 
 
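With the downcasing removed, the feature keeps whatever name the extractor hands it (a sketch; not part of the commit):

    from capa.features.insn import API

    # managed names keep their case; callers handle any normalization themselves.
    assert API("System.IO.File::OpenRead").value == "System.IO.File::OpenRead"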
capa/helpers.py
@@ -5,10 +5,20 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 
 import os
+import logging
 from typing import NoReturn
 
+from capa.exceptions import UnsupportedFormatError
+from capa.features.common import FORMAT_SC32, FORMAT_SC64, FORMAT_UNKNOWN
+from capa.features.extractors.common import extract_format
+
+EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32")
+EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64")
+
+
+logger = logging.getLogger("capa")
+
 _hex = hex
 
 
@@ -35,3 +45,72 @@ def is_runtime_ida():
 
 def assert_never(value: NoReturn) -> NoReturn:
     assert False, f"Unhandled value: {value} ({type(value).__name__})"
+
+
+def get_format_from_extension(sample: str) -> str:
+    if sample.endswith(EXTENSIONS_SHELLCODE_32):
+        return FORMAT_SC32
+    elif sample.endswith(EXTENSIONS_SHELLCODE_64):
+        return FORMAT_SC64
+    return FORMAT_UNKNOWN
+
+
+def get_auto_format(path: str) -> str:
+    format_ = get_format(path)
+    if format_ == FORMAT_UNKNOWN:
+        format_ = get_format_from_extension(path)
+    if format_ == FORMAT_UNKNOWN:
+        raise UnsupportedFormatError()
+    return format_
+
+
+def get_format(sample: str) -> str:
+    with open(sample, "rb") as f:
+        buf = f.read()
+
+    for feature, _ in extract_format(buf):
+        assert isinstance(feature.value, str)
+        return feature.value
+
+    return FORMAT_UNKNOWN
+
+
+def log_unsupported_format_error():
+    logger.error("-" * 80)
+    logger.error(" Input file does not appear to be a PE or ELF file.")
+    logger.error(" ")
+    logger.error(
+        " capa currently only supports analyzing PE and ELF files (or shellcode, when using --format sc32|sc64)."
+    )
+    logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.")
+    logger.error("-" * 80)
+
+
+def log_unsupported_os_error():
+    logger.error("-" * 80)
+    logger.error(" Input file does not appear to target a supported OS.")
+    logger.error(" ")
+    logger.error(
+        " capa currently only supports analyzing executables for some operating systems (including Windows and Linux)."
+    )
+    logger.error("-" * 80)
+
+
+def log_unsupported_arch_error():
+    logger.error("-" * 80)
+    logger.error(" Input file does not appear to target a supported architecture.")
+    logger.error(" ")
+    logger.error(" capa currently only supports analyzing x86 (32- and 64-bit).")
+    logger.error("-" * 80)
+
+
+def log_unsupported_runtime_error():
+    logger.error("-" * 80)
+    logger.error(" Unsupported runtime or Python interpreter.")
+    logger.error(" ")
+    logger.error(" capa supports running under Python 3.7 and higher.")
+    logger.error(" ")
+    logger.error(
+        " If you're seeing this message on the command line, please ensure you're running a supported Python version."
+    )
+    logger.error("-" * 80)
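How callers are expected to use the relocated helpers, as a sketch ("sample.bin" is a placeholder path; not part of the commit):

    from capa.exceptions import UnsupportedFormatError
    from capa.helpers import get_auto_format, log_unsupported_format_error

    try:
        # sniffs the file contents first, then falls back to the
        # sc32/raw32/sc64/raw64 filename extensions.
        format_ = get_auto_format("sample.bin")
    except UnsupportedFormatError:
        log_unsupported_format_error()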
capa/main.py | 209
@@ -42,17 +42,34 @@ import capa.features.extractors
 import capa.features.extractors.common
 import capa.features.extractors.pefile
 import capa.features.extractors.elffile
+import capa.features.extractors.dotnetfile
 from capa.rules import Rule, Scope, RuleSet
 from capa.engine import FeatureSet, MatchResults
-from capa.helpers import get_file_taste
+from capa.helpers import (
+    get_format,
+    get_file_taste,
+    get_auto_format,
+    log_unsupported_os_error,
+    log_unsupported_arch_error,
+    log_unsupported_format_error,
+)
+from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError, UnsupportedRuntimeError
+from capa.features.common import (
+    FORMAT_PE,
+    FORMAT_ELF,
+    FORMAT_AUTO,
+    FORMAT_SC32,
+    FORMAT_SC64,
+    FORMAT_DOTNET,
+    FORMAT_FREEZE,
+)
 from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
 
 RULES_PATH_DEFAULT_STRING = "(embedded rules)"
 SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)"
 BACKEND_VIV = "vivisect"
 BACKEND_SMDA = "smda"
-EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32")
-EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64")
+BACKEND_DOTNET = "dotnet"
 
 E_MISSING_RULES = -10
 E_MISSING_FILE = -11
@@ -287,6 +304,7 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None):
     return matches, meta
 
 
+# TODO move all to helpers?
 def has_rule_with_namespace(rules, capabilities, rule_cat):
     for rule_name in capabilities.keys():
         if rules.rules[rule_name].meta.get("namespace", "").startswith(rule_cat):
@@ -334,17 +352,6 @@ def is_supported_format(sample: str) -> bool:
     return len(list(capa.features.extractors.common.extract_format(taste))) == 1
 
 
-def get_format(sample: str) -> str:
-    with open(sample, "rb") as f:
-        buf = f.read()
-
-    for feature, _ in capa.features.extractors.common.extract_format(buf):
-        assert isinstance(feature.value, str)
-        return feature.value
-
-    return "unknown"
-
-
 def is_supported_arch(sample: str) -> bool:
     with open(sample, "rb") as f:
         buf = f.read()
@@ -433,19 +440,7 @@ def get_default_signatures() -> List[str]:
     return ret
 
 
-class UnsupportedFormatError(ValueError):
-    pass
-
-
-class UnsupportedArchError(ValueError):
-    pass
-
-
-class UnsupportedOSError(ValueError):
-    pass
-
-
-def get_workspace(path, format, sigpaths):
+def get_workspace(path, format_, sigpaths):
     """
     load the program at the given path into a vivisect workspace using the given format.
     also apply the given FLIRT signatures.
@@ -465,21 +460,22 @@ def get_workspace(path, format, sigpaths):
     import viv_utils
 
     logger.debug("generating vivisect workspace for: %s", path)
-    if format == "auto":
+    # TODO should not be auto at this point, anymore
+    if format_ == FORMAT_AUTO:
         if not is_supported_format(path):
             raise UnsupportedFormatError()
 
         # don't analyze, so that we can add our Flirt function analyzer first.
         vw = viv_utils.getWorkspace(path, analyze=False, should_save=False)
-    elif format in {"pe", "elf"}:
+    elif format_ in {FORMAT_PE, FORMAT_ELF}:
         vw = viv_utils.getWorkspace(path, analyze=False, should_save=False)
-    elif format == "sc32":
+    elif format_ == FORMAT_SC32:
         # these are not analyzed nor saved.
         vw = viv_utils.getShellcodeWorkspaceFromFile(path, arch="i386", analyze=False)
-    elif format == "sc64":
+    elif format_ == FORMAT_SC64:
         vw = viv_utils.getShellcodeWorkspaceFromFile(path, arch="amd64", analyze=False)
     else:
-        raise ValueError("unexpected format: " + format)
+        raise ValueError("unexpected format: " + format_)
 
     viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths)
 
@@ -489,12 +485,9 @@ def get_workspace(path, format, sigpaths):
     return vw
 
 
-class UnsupportedRuntimeError(RuntimeError):
-    pass
-
-
+# TODO get_extractors -> List[FeatureExtractor]?
 def get_extractor(
-    path: str, format: str, backend: str, sigpaths: List[str], should_save_workspace=False, disable_progress=False
+    path: str, format_: str, backend: str, sigpaths: List[str], should_save_workspace=False, disable_progress=False
 ) -> FeatureExtractor:
     """
     raises:
@@ -502,7 +495,7 @@ def get_extractor(
     UnsupportedArchError
     UnsupportedOSError
     """
-    if format not in ("sc32", "sc64"):
+    if format_ not in (FORMAT_SC32, FORMAT_SC64):
         if not is_supported_format(path):
             raise UnsupportedFormatError()
 
@@ -512,6 +505,11 @@
         if not is_supported_os(path):
             raise UnsupportedOSError()
 
+    if format_ == FORMAT_DOTNET:
+        import capa.features.extractors.dnfile.extractor
+
+        return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
+
     if backend == "smda":
         from smda.SmdaConfig import SmdaConfig
         from smda.Disassembler import Disassembler
@@ -530,7 +528,7 @@ def get_extractor(
         import capa.features.extractors.viv.extractor
 
     with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
-        vw = get_workspace(path, format, sigpaths)
+        vw = get_workspace(path, format_, sigpaths)
 
         if should_save_workspace:
             logger.debug("saving workspace")
@@ -545,6 +543,22 @@
         return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, path)
 
 
+def get_file_extractors(sample: str, format_: str) -> List[FeatureExtractor]:
+    file_extractors: List[FeatureExtractor] = list()
+
+    if format_ == capa.features.extractors.common.FORMAT_PE:
+        file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample))
+
+        dotnetfile_extractor = capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(sample)
+        if dotnetfile_extractor.is_dotnet_file():
+            file_extractors.append(dotnetfile_extractor)
+
+    elif format_ == capa.features.extractors.common.FORMAT_ELF:
+        file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample))
+
+    return file_extractors
+
+
 def is_nursery_rule_path(path: str) -> bool:
     """
     The nursery is a spot for rules that have not yet been fully polished.
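A sketch of the new file-scope pre-pass ("hello.exe" is a placeholder path; not part of the commit): for a PE input it returns the pefile extractor and, when a CLR header is present, the dotnet file extractor as well.

    import capa.main

    for file_extractor in capa.main.get_file_extractors("hello.exe", capa.main.FORMAT_PE):
        for feature, va in file_extractor.extract_file_features():
            print(type(file_extractor).__name__, feature, hex(va))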
@@ -653,7 +667,7 @@ def collect_metadata(argv, sample_path, rules_path, extractor):
     if rules_path != [RULES_PATH_DEFAULT_STRING]:
         rules_path = [os.path.abspath(os.path.normpath(r)) for r in rules_path]
 
-    format = get_format(sample_path)
+    format_ = get_format(sample_path)
     arch = get_arch(sample_path)
     os_ = get_os(sample_path)
 
@@ -668,7 +682,7 @@ def collect_metadata(argv, sample_path, rules_path, extractor):
             "path": os.path.normpath(sample_path),
         },
         "analysis": {
-            "format": format,
+            "format": format_,
             "arch": arch,
             "os": os_,
             "extractor": extractor.__class__.__name__,
@@ -783,19 +797,20 @@ def install_common_args(parser, wanted=None):
 
     if "format" in wanted:
         formats = [
-            ("auto", "(default) detect file type automatically"),
-            ("pe", "Windows PE file"),
-            ("elf", "Executable and Linkable Format"),
-            ("sc32", "32-bit shellcode"),
-            ("sc64", "64-bit shellcode"),
-            ("freeze", "features previously frozen by capa"),
+            (FORMAT_AUTO, "(default) detect file type automatically"),
+            (FORMAT_PE, "Windows PE file"),
+            (FORMAT_DOTNET, ".NET PE file"),
+            (FORMAT_ELF, "Executable and Linkable Format"),
+            (FORMAT_SC32, "32-bit shellcode"),
+            (FORMAT_SC64, "64-bit shellcode"),
+            (FORMAT_FREEZE, "features previously frozen by capa"),
         ]
         format_help = ", ".join(["%s: %s" % (f[0], f[1]) for f in formats])
         parser.add_argument(
             "-f",
             "--format",
            choices=[f[0] for f in formats],
-            default="auto",
+            default=FORMAT_AUTO,
            help="select sample format, %s" % format_help,
        )
 
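On the command line this surfaces as a new -f choice; assuming a managed sample at hello.exe (placeholder name):

    capa -f dotnet hello.exe

With the default -f auto, the file-scope pass in main() switches to the dotnet code path on its own when it finds a DotnetFileFeatureExtractor among the file extractors.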
@@ -974,13 +989,21 @@ def main(argv=None):
         return ret
 
     try:
-        taste = get_file_taste(args.sample)
+        _ = get_file_taste(args.sample)
     except IOError as e:
         # per our research there's not a programmatic way to render the IOError with non-ASCII filename unless we
         # handle the IOError separately and reach into the args
         logger.error("%s", e.args[0])
         return E_MISSING_FILE
 
+    format_ = args.format
+    if format_ == FORMAT_AUTO:
+        try:
+            format_ = get_auto_format(args.sample)
+        except UnsupportedFormatError:
+            log_unsupported_format_error()
+            return E_INVALID_FILE_TYPE
+
     try:
         rules = get_rules(args.rules, disable_progress=args.quiet)
         rules = capa.rules.RuleSet(rules)
@@ -1002,26 +1025,23 @@
         logger.error("%s", str(e))
         return E_INVALID_RULE
 
-    file_extractor = None
-    if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")):
-        # these pefile and elffile file feature extractors are pretty light weight: they don't do any code analysis.
-        # so we can fairly quickly determine if the given file has "pure" file-scope rules
-        # that indicate a limitation (like "file is packed based on section names")
-        # and avoid doing a full code analysis on difficult/impossible binaries.
-        try:
-            file_extractor = capa.features.extractors.pefile.PefileFeatureExtractor(args.sample)
-        except PEFormatError as e:
-            logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e))
-            return E_CORRUPT_FILE
+    # file feature extractors are pretty lightweight: they don't do any code analysis.
+    # so we can fairly quickly determine if the given file has "pure" file-scope rules
+    # that indicate a limitation (like "file is packed based on section names")
+    # and avoid doing a full code analysis on difficult/impossible binaries.
+    #
+    # this pass can inspect multiple file extractors, e.g., dotnet and pe to identify
+    # various limitations
+    try:
+        file_extractors = get_file_extractors(args.sample, format_)
+    except PEFormatError as e:
+        logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e))
+        return E_CORRUPT_FILE
+    except (ELFError, OverflowError) as e:
+        logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e))
+        return E_CORRUPT_FILE
 
-    elif args.format == "elf" or (args.format == "auto" and taste.startswith(b"\x7fELF")):
-        try:
-            file_extractor = capa.features.extractors.elffile.ElfFeatureExtractor(args.sample)
-        except (ELFError, OverflowError) as e:
-            logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e))
-            return E_CORRUPT_FILE
-
-    if file_extractor:
+    for file_extractor in file_extractors:
         try:
             pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {})
         except PEFormatError as e:
@@ -1040,58 +1060,37 @@
             logger.debug("file limitation short circuit, won't analyze fully.")
             return E_FILE_LIMITATION
 
-    try:
-        if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")):
-            sig_paths = get_signatures(args.signatures)
-        else:
-            sig_paths = []
-            logger.debug("skipping library code matching: only have PE signatures")
-    except (IOError) as e:
-        logger.error("%s", str(e))
-        return E_INVALID_SIG
+        if isinstance(file_extractor, capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor):
+            format_ = FORMAT_DOTNET
 
-    if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)):
-        format = "freeze"
+    if format_ == FORMAT_FREEZE:
         with open(args.sample, "rb") as f:
             extractor = capa.features.freeze.load(f.read())
     else:
-        format = args.format
-        if format == "auto" and args.sample.endswith(EXTENSIONS_SHELLCODE_32):
-            format = "sc32"
-        elif format == "auto" and args.sample.endswith(EXTENSIONS_SHELLCODE_64):
-            format = "sc64"
+        try:
+            if format_ == FORMAT_PE:
+                sig_paths = get_signatures(args.signatures)
+            else:
+                sig_paths = []
+                logger.debug("skipping library code matching: only have native PE signatures")
+        except IOError as e:
+            logger.error("%s", str(e))
+            return E_INVALID_SIG
 
         should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None)
 
         try:
             extractor = get_extractor(
-                args.sample, format, args.backend, sig_paths, should_save_workspace, disable_progress=args.quiet
+                args.sample, format_, args.backend, sig_paths, should_save_workspace, disable_progress=args.quiet
             )
         except UnsupportedFormatError:
-            logger.error("-" * 80)
-            logger.error(" Input file does not appear to be a PE or ELF file.")
-            logger.error(" ")
-            logger.error(
-                " capa currently only supports analyzing PE and ELF files (or shellcode, when using --format sc32|sc64)."
-            )
-            logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.")
-            logger.error("-" * 80)
+            log_unsupported_format_error()
            return E_INVALID_FILE_TYPE
        except UnsupportedArchError:
-            logger.error("-" * 80)
-            logger.error(" Input file does not appear to target a supported architecture.")
-            logger.error(" ")
-            logger.error(" capa currently only supports analyzing x86 (32- and 64-bit).")
-            logger.error("-" * 80)
+            log_unsupported_arch_error()
            return E_INVALID_FILE_ARCH
        except UnsupportedOSError:
-            logger.error("-" * 80)
-            logger.error(" Input file does not appear to target a supported OS.")
-            logger.error(" ")
-            logger.error(
-                " capa currently only supports analyzing executables for some operating systems (including Windows and Linux)."
-            )
-            logger.error("-" * 80)
+            log_unsupported_os_error()
            return E_INVALID_FILE_OS
 
     meta = collect_metadata(argv, args.sample, args.rules, extractor)
capa/render/json.py
@@ -7,9 +7,9 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import json
 
-import capa.render.result_document
 from capa.rules import RuleSet
 from capa.engine import MatchResults
+from capa.render.result_document import convert_capabilities_to_result_document
 
 
 class CapaJsonObjectEncoder(json.JSONEncoder):
@@ -27,7 +27,7 @@ class CapaJsonObjectEncoder(json.JSONEncoder):
 
 def render(meta, rules: RuleSet, capabilities: MatchResults) -> str:
     return json.dumps(
-        capa.render.result_document.convert_capabilities_to_result_document(meta, rules, capabilities),
+        convert_capabilities_to_result_document(meta, rules, capabilities),
         cls=CapaJsonObjectEncoder,
         sort_keys=True,
     )
@@ -7,7 +7,6 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import copy
 
-import capa.rules
 import capa.engine
 import capa.render.utils
 import capa.features.common
@@ -41,6 +41,7 @@ import tqdm.contrib.logging
 import capa.main
 import capa.rules
 import capa.engine
+import capa.helpers
 import capa.features.insn
 import capa.features.common
 from capa.rules import Rule, RuleSet
@@ -286,16 +287,16 @@ def get_sample_capabilities(ctx: Context, path: Path) -> Set[str]:
         logger.debug("found cached results: %s: %d capabilities", nice_path, len(ctx.capabilities_by_sample[path]))
         return ctx.capabilities_by_sample[path]
 
-    if nice_path.endswith(capa.main.EXTENSIONS_SHELLCODE_32):
-        format = "sc32"
-    elif nice_path.endswith(capa.main.EXTENSIONS_SHELLCODE_64):
-        format = "sc64"
+    if nice_path.endswith(capa.helpers.EXTENSIONS_SHELLCODE_32):
+        format_ = "sc32"
+    elif nice_path.endswith(capa.helpers.EXTENSIONS_SHELLCODE_64):
+        format_ = "sc64"
     else:
-        format = "auto"
+        format_ = "auto"
 
     logger.debug("analyzing sample: %s", nice_path)
     extractor = capa.main.get_extractor(
-        nice_path, format, capa.main.BACKEND_VIV, DEFAULT_SIGNATURES, False, disable_progress=True
+        nice_path, format_, capa.main.BACKEND_VIV, DEFAULT_SIGNATURES, False, disable_progress=True
     )
 
     capabilities, _ = capa.main.find_capabilities(ctx.rules, extractor, disable_progress=True)
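The format -> format_ rename above (and the matching change in main) avoids rebinding the name of Python's builtin format(). A minimal illustration of the problem the trailing underscore sidesteps:

    format = "sc32"                # shadows the builtin format()
    # format(3.14159, ".2f")       # would now raise TypeError: 'str' object is not callable

    format_ = "sc32"               # trailing underscore keeps the builtin usable
    print(format(3.14159, ".2f"))  # prints "3.14"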
@@ -59,7 +59,9 @@ import colorama
 import capa.main
 import capa.rules
 import capa.engine
+import capa.helpers
 import capa.features
+import capa.exceptions
 import capa.render.utils as rutils
 import capa.features.freeze
 import capa.render.result_document
@@ -162,25 +164,11 @@ def main(argv=None):
         extractor = capa.main.get_extractor(
             args.sample, args.format, args.backend, sig_paths, should_save_workspace
         )
-    except capa.main.UnsupportedFormatError:
-        logger.error("-" * 80)
-        logger.error(" Input file does not appear to be a PE file.")
-        logger.error(" ")
-        logger.error(
-            " capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64)."
-        )
-        logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.")
-        logger.error("-" * 80)
+    except capa.exceptions.UnsupportedFormatError:
+        capa.helpers.log_unsupported_format_error()
         return -1
-    except capa.main.UnsupportedRuntimeError:
-        logger.error("-" * 80)
-        logger.error(" Unsupported runtime or Python interpreter.")
-        logger.error(" ")
-        logger.error(" capa supports running under Python 2.7 using Vivisect for binary analysis.")
-        logger.error(" It can also run within IDA Pro, using either Python 2.7 or 3.5+.")
-        logger.error(" ")
-        logger.error(" If you're seeing this message on the command line, please ensure you're running Python 2.7.")
-        logger.error("-" * 80)
+    except capa.exceptions.UnsupportedRuntimeError:
+        capa.helpers.log_unsupported_runtime_error()
         return -1
 
     meta = capa.main.collect_metadata(argv, args.sample, args.rules, extractor)
@@ -75,8 +75,10 @@ import capa.rules
 import capa.engine
 import capa.helpers
 import capa.features
+import capa.exceptions
 import capa.features.common
 import capa.features.freeze
+from capa.helpers import log_unsupported_runtime_error
 
 logger = logging.getLogger("capa.show-features")
 
@@ -113,27 +115,19 @@ def main(argv=None):
         extractor = capa.main.get_extractor(
             args.sample, args.format, args.backend, sig_paths, should_save_workspace
         )
-    except capa.main.UnsupportedFormatError:
-        logger.error("-" * 80)
-        logger.error(" Input file does not appear to be a PE file.")
-        logger.error(" ")
-        logger.error(
-            " capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64)."
-        )
-        logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.")
-        logger.error("-" * 80)
+    except capa.exceptions.UnsupportedFormatError:
+        capa.helpers.log_unsupported_format_error()
         return -1
-    except capa.main.UnsupportedRuntimeError:
-        logger.error("-" * 80)
-        logger.error(" Unsupported runtime or Python interpreter.")
-        logger.error(" ")
-        logger.error(" capa supports running under Python 2.7 using Vivisect for binary analysis.")
-        logger.error(" It can also run within IDA Pro, using either Python 2.7 or 3.5+.")
-        logger.error(" ")
-        logger.error(" If you're seeing this message on the command line, please ensure you're running Python 2.7.")
-        logger.error("-" * 80)
+    except capa.exceptions.UnsupportedRuntimeError:
+        log_unsupported_runtime_error()
         return -1
 
+    for feature, va in extractor.extract_global_features():
+        if va:
+            print("global: 0x%08x: %s" % (va, feature))
+        else:
+            print("global: 0x00000000: %s" % (feature))
+
     if not args.function:
         for feature, va in extractor.extract_file_features():
             if va:

setup.py (2 additions)
@@ -26,6 +26,8 @@ requirements = [
     "smda==1.7.1",
     "pefile==2021.9.3",
     "pyelftools==0.28",
+    "dnfile==0.10.0",
+    "dncil==1.0.0",
 ]
 
 # this sets __version__
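The two new requirements back the .NET support: dnfile parses the CLR metadata of a .NET PE, and dncil disassembles its CIL method bodies. A rough sketch of the kind of check dnfile enables (the sample path is a placeholder; capa's own detection logic may differ):

    import dnfile

    pe = dnfile.dnPE("sample.exe")  # placeholder path
    if pe.net is not None:
        # the file carries a CLR (.NET) header, so the dotnet feature extractors apply
        print("looks like a .NET assembly")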
@@ -22,9 +22,23 @@ import capa.features.file
 import capa.features.insn
 import capa.features.common
 import capa.features.basicblock
-from capa.features.common import OS, OS_LINUX, ARCH_I386, FORMAT_PE, ARCH_AMD64, FORMAT_ELF, OS_WINDOWS, Arch, Format
+from capa.features.common import (
+    OS,
+    OS_ANY,
+    OS_LINUX,
+    ARCH_I386,
+    FORMAT_PE,
+    ARCH_AMD64,
+    FORMAT_ELF,
+    OS_WINDOWS,
+    FORMAT_DOTNET,
+    Arch,
+    Format,
+)
 
 CD = os.path.dirname(__file__)
+DOTNET_DIR = os.path.join(CD, "data", "dotnet")
+DNFILE_TESTFILES = os.path.join(DOTNET_DIR, "dnfile-testfiles")
 
 
 @contextlib.contextmanager
@@ -122,6 +136,19 @@ def get_pefile_extractor(path):
     return capa.features.extractors.pefile.PefileFeatureExtractor(path)
 
 
+def get_dotnetfile_extractor(path):
+    import capa.features.extractors.dotnetfile
+
+    return capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(path)
+
+
+@lru_cache(maxsize=1)
+def get_dnfile_extractor(path):
+    import capa.features.extractors.dnfile.extractor
+
+    return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
+
+
 def extract_global_features(extractor):
     features = collections.defaultdict(set)
     for feature, va in extractor.extract_global_features():
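These helpers mirror the existing get_pefile_extractor pattern: a test grabs an extractor and iterates its features. A rough usage sketch (the path is shown only for illustration; the fixtures below resolve it via get_data_path_by_name):

    extractor = get_dnfile_extractor("tests/data/dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_")
    for feature, va in extractor.extract_global_features():
        print("global", hex(va) if va else "-", feature)
    for feature, va in extractor.extract_file_features():
        print("file", hex(va) if va else "-", feature)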
@@ -220,6 +247,14 @@ def get_data_path_by_name(name):
         return os.path.join(CD, "data", "79abd17391adc6251ecdc58d13d76baf.dll_")
     elif name.startswith("946a9"):
         return os.path.join(CD, "data", "946a99f36a46d335dec080d9a4371940.dll_")
+    elif name.startswith("b9f5b"):
+        return os.path.join(CD, "data", "b9f5bd514485fb06da39beff051b9fdc.exe_")
+    elif name.startswith("mixed-mode-64"):
+        return os.path.join(DNFILE_TESTFILES, "mixed-mode", "ModuleCode", "bin", "ModuleCode_amd64.exe")
+    elif name.startswith("hello-world"):
+        return os.path.join(DNFILE_TESTFILES, "hello-world", "hello-world.exe")
+    elif name.startswith("_1c444"):
+        return os.path.join(CD, "data", "dotnet", "1c444ebeba24dcba8628b7dfe5fec7c6.exe_")
     else:
         raise ValueError("unexpected sample fixture: %s" % name)
 
@@ -272,7 +307,9 @@ def get_sample_md5_by_name(name):
     elif name.startswith("79abd"):
         return "79abd17391adc6251ecdc58d13d76baf"
     elif name.startswith("946a9"):
-        return "946a99f36a46d335dec080d9a4371940.dll_"
+        return "946a99f36a46d335dec080d9a4371940"
+    elif name.startswith("b9f5b"):
+        return "b9f5bd514485fb06da39beff051b9fdc"
     else:
         raise ValueError("unexpected sample fixture: %s" % name)
 
@@ -626,6 +663,39 @@ FEATURE_PRESENCE_TESTS = sorted(
     key=lambda t: (t[0], t[1]),
 )
 
+FEATURE_PRESENCE_TESTS_DOTNET = sorted(
+    [
+        ("b9f5b", "file", Arch(ARCH_I386), True),
+        ("b9f5b", "file", Arch(ARCH_AMD64), False),
+        ("mixed-mode-64", "file", Arch(ARCH_AMD64), True),
+        ("mixed-mode-64", "file", Arch(ARCH_I386), False),
+        ("b9f5b", "file", OS(OS_ANY), True),
+        ("b9f5b", "file", Format(FORMAT_DOTNET), True),
+        ("hello-world", "function=0x250", capa.features.common.String("Hello World!"), True),
+        ("hello-world", "function=0x250, bb=0x250, insn=0x252", capa.features.common.String("Hello World!"), True),
+        ("hello-world", "function=0x250", capa.features.insn.API("System.Console::WriteLine"), True),
+        ("hello-world", "file", capa.features.file.Import("System.Console::WriteLine"), True),
+        ("_1c444", "file", capa.features.file.Import("gdi32.CreateCompatibleBitmap"), True),
+        ("_1c444", "file", capa.features.file.Import("CreateCompatibleBitmap"), True),
+        ("_1c444", "file", capa.features.file.Import("gdi32::CreateCompatibleBitmap"), False),
+        ("_1c444", "function=0x1F68", capa.features.insn.API("GetWindowDC"), True),
+        ("_1c444", "function=0x1F68", capa.features.insn.API("user32.GetWindowDC"), True),
+        ("_1c444", "function=0x1F68", capa.features.insn.Number(0xCC0020), True),
+        ("_1c444", "function=0x1F68", capa.features.insn.Number(0x0), True),
+        ("_1c444", "function=0x1F68", capa.features.insn.Number(0x1), False),
+        (
+            "_1c444",
+            "function=0x1F68, bb=0x1F68, insn=0x1FF9",
+            capa.features.insn.API("System.Drawing.Image::FromHbitmap"),
+            True,
+        ),
+        ("_1c444", "function=0x1F68, bb=0x1F68, insn=0x1FF9", capa.features.insn.API("FromHbitmap"), False),
+    ],
+    # order tests by (file, item)
+    # so that our LRU cache is most effective.
+    key=lambda t: (t[0], t[1]),
+)
+
 FEATURE_PRESENCE_TESTS_IDA = [
     # file/imports
     # IDA can recover more names of APIs imported by ordinal
@@ -641,6 +711,9 @@ FEATURE_COUNT_TESTS = [
 ]
 
 
+FEATURE_COUNT_TESTS_DOTNET = []  # type: ignore
+
+
 def do_test_feature_presence(get_extractor, sample, scope, feature, expected):
     extractor = get_extractor(sample)
     features = scope(extractor)
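do_test_feature_presence (its first lines shown above) is the driver that consumes the FEATURE_PRESENCE_TESTS_DOTNET tuples: sample and scope arrive already resolved by the fixtures, and the helper checks whether the feature was extracted. The rest of its body is outside this diff; it presumably reduces to a containment check along these lines (sketch, not the actual implementation):

    def do_test_feature_presence(get_extractor, sample, scope, feature, expected):
        extractor = get_extractor(sample)
        features = scope(extractor)
        if expected:
            assert feature in features
        else:
            assert feature not in features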
@@ -738,3 +811,23 @@ def al_khaser_x86_extractor():
 @pytest.fixture
 def pingtaest_extractor():
     return get_extractor(get_data_path_by_name("pingtaest"))
+
+
+@pytest.fixture
+def b9f5b_dotnetfile_extractor():
+    return get_dotnetfile_extractor(get_data_path_by_name("b9f5b"))
+
+
+@pytest.fixture
+def mixed_mode_64_dotnetfile_extractor():
+    return get_dotnetfile_extractor(get_data_path_by_name("mixed-mode-64"))
+
+
+@pytest.fixture
+def hello_world_dnfile_extractor():
+    return get_dnfile_extractor(get_data_path_by_name("hello-world"))
+
+
+@pytest.fixture
+def _1c444_dnfile_extractor():
+    return get_dnfile_extractor(get_data_path_by_name("_1c444"))

tests/test_dnfile_features.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import pytest
+import fixtures
+from fixtures import *
+from fixtures import parametrize
+
+
+@parametrize(
+    "sample,scope,feature,expected",
+    fixtures.FEATURE_PRESENCE_TESTS_DOTNET,
+    indirect=["sample", "scope"],
+)
+def test_dnfile_features(sample, scope, feature, expected):
+    fixtures.do_test_feature_presence(fixtures.get_dnfile_extractor, sample, scope, feature, expected)
+
+
+@parametrize(
+    "sample,scope,feature,expected",
+    fixtures.FEATURE_COUNT_TESTS_DOTNET,
+    indirect=["sample", "scope"],
+)
+def test_dnfile_feature_counts(sample, scope, feature, expected):
+    fixtures.do_test_feature_count(fixtures.get_dnfile_extractor, sample, scope, feature, expected)

tests/test_dotnetfile_features.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import pytest
+import fixtures
+from fixtures import *
+from fixtures import parametrize
+
+
+@parametrize(
+    "sample,scope,feature,expected",
+    fixtures.FEATURE_PRESENCE_TESTS_DOTNET,
+    indirect=["sample", "scope"],
+)
+def test_dotnetfile_features(sample, scope, feature, expected):
+    if scope.__name__ != "file":
+        pytest.xfail("dotnetfile only extracts file scope features")
+
+    if isinstance(feature, capa.features.file.FunctionName):
+        pytest.xfail("dotnetfile doesn't extract function names")
+
+    fixtures.do_test_feature_presence(fixtures.get_dotnetfile_extractor, sample, scope, feature, expected)
+
+
+@parametrize(
+    "extractor,function,expected",
+    [
+        ("b9f5b_dotnetfile_extractor", "is_dotnet_file", True),
+        ("b9f5b_dotnetfile_extractor", "is_mixed_mode", False),
+        ("mixed_mode_64_dotnetfile_extractor", "is_mixed_mode", True),
+        ("b9f5b_dotnetfile_extractor", "get_entry_point", 0x6000007),
+        ("b9f5b_dotnetfile_extractor", "get_runtime_version", (2, 5)),
+        ("b9f5b_dotnetfile_extractor", "get_meta_version_string", "v2.0.50727"),
+    ],
+)
+def test_dotnetfile_extractor(request, extractor, function, expected):
+    extractor_function = getattr(request.getfixturevalue(extractor), function)
+    assert extractor_function() == expected
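To run just these new .NET test modules, an invocation like the following should work from the repository root (module paths assumed; calling pytest from the shell on the same files is equivalent):

    import pytest

    # select only the dotnet-related test modules added by this commit
    pytest.main(["tests/test_dnfile_features.py", "tests/test_dotnetfile_features.py", "-v"])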