diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini index d3eeee93..1a92b86d 100644 --- a/.github/mypy/mypy.ini +++ b/.github/mypy/mypy.ini @@ -60,6 +60,9 @@ ignore_missing_imports = True [mypy-ida_loader.*] ignore_missing_imports = True +[mypy-ida_segment.*] +ignore_missing_imports = True + [mypy-PyQt5.*] ignore_missing_imports = True diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index df80d708..52be6841 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,12 +28,12 @@ jobs: asset_name: macos steps: - name: Checkout capa - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: submodules: true # using Python 3.8 to support running across multiple operating systems including Windows 7 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.8 - if: matrix.os == 'ubuntu-18.04' @@ -50,7 +50,7 @@ jobs: run: dist/capa "tests/data/499c2a85f6e8142c3f48d4251c9c7cd6.raw32" - name: Does it run (ELF)? 
run: dist/capa "tests/data/7351f8a40c5450557b24622417fc478d.elf_" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 with: name: ${{ matrix.asset_name }} path: dist/${{ matrix.artifact_name }} @@ -74,7 +74,7 @@ jobs: asset_name: windows steps: - name: Download ${{ matrix.asset_name }} - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: ${{ matrix.asset_name }} - name: Set executable flag @@ -100,7 +100,7 @@ jobs: artifact_name: capa steps: - name: Download ${{ matrix.asset_name }} - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: ${{ matrix.asset_name }} - name: Set executable flag diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml index c028ce82..b68845f7 100644 --- a/.github/workflows/changelog.yml +++ b/.github/workflows/changelog.yml @@ -17,7 +17,7 @@ jobs: steps: - name: Get changed files id: files - uses: Ana06/get-changed-files@v1.2 + uses: Ana06/get-changed-files@v2.2.0 - name: check changelog updated id: changelog_updated env: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 42525df9..65278522 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -11,9 +11,9 @@ jobs: deploy: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: '3.7' - name: Install dependencies diff --git a/.github/workflows/tag.yml b/.github/workflows/tag.yml index bed2512d..744ea207 100644 --- a/.github/workflows/tag.yml +++ b/.github/workflows/tag.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout capa-rules - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: mandiant/capa-rules token: ${{ secrets.CAPA_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ed866547..6678e0aa 100644 --- 
a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout capa - uses: actions/checkout@v2 + uses: actions/checkout@v3 # The sync GH action in capa-rules relies on a single '- *$' in the CHANGELOG file - name: Ensure CHANGELOG has '- *$' run: | @@ -26,9 +26,9 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout capa - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.8" - name: Install dependencies @@ -40,17 +40,17 @@ jobs: - name: Lint with pycodestyle run: pycodestyle --show-source capa/ scripts/ tests/ - name: Check types with mypy - run: mypy --config-file .github/mypy/mypy.ini capa/ scripts/ tests/ + run: mypy --config-file .github/mypy/mypy.ini --check-untyped-defs capa/ scripts/ tests/ rule_linter: runs-on: ubuntu-20.04 steps: - name: Checkout capa with submodules - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: submodules: recursive - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.8" - name: Install capa @@ -67,7 +67,7 @@ jobs: matrix: os: [ubuntu-20.04, windows-2019, macos-11] # across all operating systems - python-version: ["3.7", "3.10"] + python-version: ["3.7", "3.11"] include: # on Ubuntu run these as well - os: ubuntu-20.04 @@ -76,11 +76,11 @@ jobs: python-version: "3.9" steps: - name: Checkout capa with submodules - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install pyyaml diff --git a/CHANGELOG.md b/CHANGELOG.md index ced8edf1..7903c420 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,14 +4,18 @@ ### New Features - verify rule metadata format on load #1160 @mr-tz -- 
extract property features from .NET PE files #1168 @anushkavirgaonkar -- emit features for .NET newobj instruction #1186 @mike-hunhoff -- fix import-to-ida script formatting #1208 @williballenthin +- dotnet: emit property features #1168 @anushkavirgaonkar +- dotnet: emit API features for objects created via the newobj instruction #1186 @mike-hunhoff +- dotnet: emit API features for generic methods #1231 @mike-hunhoff +- Python 3.11 support #1192 @williballenthin +- dotnet: emit calls to/from MethodDef methods #1236 @mike-hunhoff +- dotnet: emit namespace/class features for ldvirtftn/ldftn instructions #1241 @mike-hunhoff +- dotnet: emit namespace/class features for type references #1242 @mike-hunhoff ### Breaking Changes - remove SMDA backend #1062 @williballenthin -### New Rules (30) +### New Rules (43) - collection/use-dotnet-library-sharpclipboard @johnk3r - data-manipulation/encryption/aes/use-dotnet-library-encryptdecryptutils @johnk3r @@ -42,6 +46,19 @@ - nursery/execute-wmi-query-in-dotnet michael.hunhoff@mandiant.com - nursery/manipulate-network-credentials-in-dotnet michael.hunhoff@mandiant.com - nursery/encrypt-data-using-aes william.ballenthin@mandiant.com Ivan Kwiatkowski (@JusticeRage) +- host-interaction/uac/bypass/bypass-uac-via-rpc david.cannings@pwc.com david@edeca.net +- nursery/check-for-vm-using-instruction-vpcext richard.weiss@mandiant.com +- nursery/get-windows-directory-from-kuser_shared_data david.cannings@pwc.com +- nursery/encrypt-data-using-openssl-dsa Ana06 +- nursery/encrypt-data-using-openssl-ecdsa Ana06 +- nursery/encrypt-data-using-openssl-rsa Ana06 +- runtime/dotnet/execute-via-dotnet-startup-hook william.ballenthin@mandiant.com +- host-interaction/console/manipulate-console-buffer william.ballenthin@mandiant.com michael.hunhoff@mandiant.com +- nursery/access-wmi-data-in-dotnet michael.hunhoff@mandiant.com +- nursery/allocate-unmanaged-memory-via-dotnet michael.hunhoff@mandiant.com +- nursery/generate-random-bytes-in-dotnet 
michael.hunhoff@mandiant.com +- nursery/manipulate-console-window michael.hunhoff@mandiant.com +- nursery/obfuscated-with-koivm michael.hunhoff@mandiant.com - ### Bug Fixes @@ -49,9 +66,18 @@ - decouple Token dependency / extractor and features #1139 @mr-tz - update pydantic model to guarantee type coercion #1176 @mike-hunhoff - do not overwrite version in version.py during PyInstaller build #1169 @mr-tz +- render: fix vverbose rendering of offsets #1215 @williballenthin +- elf: better detect OS via GLIBC ABI version needed and dependencies #1221 @williballenthin +- dotnet: address unhandled exceptions with improved type checking #1230 @mike-hunhoff +- fix import-to-ida script formatting #1208 @williballenthin ### capa explorer IDA Pro plugin - fix: display instruction items #1154 @mr-tz +- fix: accept only plaintext pasted content #1194 @williballenthin +- fix: UnboundLocalError #1217 @williballenthin +- extractor: add support for COFF files and extern functions #1223 @mike-hunhoff +- doc: improve error messaging and documentation related to capa rule set #1249 @mike-hunhoff +- fix: assume 32-bit displacement for offsets #1250 @mike-hunhoff ### Development diff --git a/README.md b/README.md index f00c9411..ccdf0db9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/flare-capa)](https://pypi.org/project/flare-capa) [![Last release](https://img.shields.io/github/v/release/mandiant/capa)](https://github.com/mandiant/capa/releases) -[![Number of rules](https://img.shields.io/badge/rules-731-blue.svg)](https://github.com/mandiant/capa-rules) +[![Number of rules](https://img.shields.io/badge/rules-742-blue.svg)](https://github.com/mandiant/capa-rules) [![CI status](https://github.com/mandiant/capa/workflows/CI/badge.svg)](https://github.com/mandiant/capa/actions?query=workflow%3ACI+event%3Apush+branch%3Amaster) 
[![Downloads](https://img.shields.io/github/downloads/mandiant/capa/total)](https://github.com/mandiant/capa/releases) [![License](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE.txt) diff --git a/capa/engine.py b/capa/engine.py index 0b45dc06..dde1e7c8 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -8,7 +8,7 @@ import copy import collections -from typing import TYPE_CHECKING, Set, Dict, List, Tuple, Mapping, Iterable +from typing import TYPE_CHECKING, Set, Dict, List, Tuple, Union, Mapping, Iterable, Iterator, cast import capa.perf import capa.features.common @@ -38,7 +38,7 @@ class Statement: """ def __init__(self, description=None): - super(Statement, self).__init__() + super().__init__() self.name = self.__class__.__name__ self.description = description @@ -60,17 +60,24 @@ class Statement: """ raise NotImplementedError() - def get_children(self): + def get_children(self) -> Iterator[Union["Statement", Feature]]: if hasattr(self, "child"): - yield self.child + # this really confuses mypy because the property may not exist + # since its defined in the subclasses. + child = self.child # type: ignore + assert isinstance(child, (Statement, Feature)) + yield child if hasattr(self, "children"): for child in getattr(self, "children"): + assert isinstance(child, (Statement, Feature)) yield child def replace_child(self, existing, new): if hasattr(self, "child"): - if self.child is existing: + # this really confuses mypy because the property may not exist + # since its defined in the subclasses. 
+ if self.child is existing: # type: ignore self.child = new if hasattr(self, "children"): @@ -90,7 +97,7 @@ class And(Statement): """ def __init__(self, children, description=None): - super(And, self).__init__(description=description) + super().__init__(description=description) self.children = children def evaluate(self, ctx, short_circuit=True): @@ -123,7 +130,7 @@ class Or(Statement): """ def __init__(self, children, description=None): - super(Or, self).__init__(description=description) + super().__init__(description=description) self.children = children def evaluate(self, ctx, short_circuit=True): @@ -150,7 +157,7 @@ class Not(Statement): """match only if the child evaluates to False.""" def __init__(self, child, description=None): - super(Not, self).__init__(description=description) + super().__init__(description=description) self.child = child def evaluate(self, ctx, short_circuit=True): @@ -172,7 +179,7 @@ class Some(Statement): """ def __init__(self, count, children, description=None): - super(Some, self).__init__(description=description) + super().__init__(description=description) self.count = count self.children = children @@ -208,7 +215,7 @@ class Range(Statement): """match if the child is contained in the ctx set with a count in the given range.""" def __init__(self, child, min=None, max=None, description=None): - super(Range, self).__init__(description=description) + super().__init__(description=description) self.child = child self.min = min if min is not None else 0 self.max = max if max is not None else (1 << 64 - 1) @@ -237,7 +244,7 @@ class Subscope(Statement): """ def __init__(self, scope, child, description=None): - super(Subscope, self).__init__(description=description) + super().__init__(description=description) self.scope = scope self.child = child diff --git a/capa/features/basicblock.py b/capa/features/basicblock.py index a7a2d15c..09f1b26d 100644 --- a/capa/features/basicblock.py +++ b/capa/features/basicblock.py @@ -11,7 +11,7 @@ from 
capa.features.common import Feature class BasicBlock(Feature): def __init__(self, description=None): - super(BasicBlock, self).__init__(None, description=description) + super().__init__(0, description=description) def __str__(self): return "basic block" diff --git a/capa/features/common.py b/capa/features/common.py index 67c9ed0d..5d30f10b 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -9,9 +9,10 @@ import re import abc import codecs +import typing import logging import collections -from typing import TYPE_CHECKING, Set, Dict, List, Union, Optional, Sequence +from typing import TYPE_CHECKING, Set, Dict, List, Union, Optional if TYPE_CHECKING: # circular import, otherwise @@ -81,7 +82,7 @@ class Result: children: List["Result"], locations: Optional[Set[Address]] = None, ): - super(Result, self).__init__() + super().__init__() self.success = success self.statement = statement self.children = children @@ -110,7 +111,7 @@ class Feature(abc.ABC): value (any): the value of the feature, such as the number or string. description (str): a human-readable description that explains the feature value. 
""" - super(Feature, self).__init__() + super().__init__() self.name = self.__class__.__name__.lower() self.value = value @@ -165,33 +166,33 @@ class Feature(abc.ABC): class MatchedRule(Feature): def __init__(self, value: str, description=None): - super(MatchedRule, self).__init__(value, description=description) + super().__init__(value, description=description) self.name = "match" class Characteristic(Feature): def __init__(self, value: str, description=None): - super(Characteristic, self).__init__(value, description=description) + super().__init__(value, description=description) class String(Feature): def __init__(self, value: str, description=None): - super(String, self).__init__(value, description=description) + super().__init__(value, description=description) class Class(Feature): def __init__(self, value: str, description=None): - super(Class, self).__init__(value, description=description) + super().__init__(value, description=description) class Namespace(Feature): def __init__(self, value: str, description=None): - super(Namespace, self).__init__(value, description=description) + super().__init__(value, description=description) class Substring(String): def __init__(self, value: str, description=None): - super(Substring, self).__init__(value, description=description) + super().__init__(value, description=description) self.value = value def evaluate(self, ctx, short_circuit=True): @@ -200,8 +201,9 @@ class Substring(String): # mapping from string value to list of locations. # will unique the locations later on. 
- matches = collections.defaultdict(list) + matches: typing.DefaultDict[str, Set[Address]] = collections.defaultdict(set) + assert isinstance(self.value, str) for feature, locations in ctx.items(): if not isinstance(feature, (String,)): continue @@ -211,31 +213,27 @@ class Substring(String): raise ValueError("unexpected feature value type") if self.value in feature.value: - matches[feature.value].extend(locations) + matches[feature.value].update(locations) if short_circuit: # we found one matching string, thats sufficient to match. # don't collect other matching strings in this mode. break if matches: - # finalize: defaultdict -> dict - # which makes json serialization easier - matches = dict(matches) - # collect all locations locations = set() - for s in matches.keys(): - matches[s] = list(set(matches[s])) - locations.update(matches[s]) + for locs in matches.values(): + locations.update(locs) # unlike other features, we cannot return put a reference to `self` directly in a `Result`. # this is because `self` may match on many strings, so we can't stuff the matched value into it. # instead, return a new instance that has a reference to both the substring and the matched values. - return Result(True, _MatchedSubstring(self, matches), [], locations=locations) + return Result(True, _MatchedSubstring(self, dict(matches)), [], locations=locations) else: return Result(False, _MatchedSubstring(self, {}), []) def __str__(self): + assert isinstance(self.value, str) return "substring(%s)" % self.value @@ -253,7 +251,7 @@ class _MatchedSubstring(Substring): substring: the substring feature that matches. match: mapping from matching string to its locations. """ - super(_MatchedSubstring, self).__init__(str(substring.value), description=substring.description) + super().__init__(str(substring.value), description=substring.description) # we want this to collide with the name of `Substring` above, # so that it works nicely with the renderers. 
self.name = "substring" @@ -261,6 +259,7 @@ class _MatchedSubstring(Substring): self.matches = matches def __str__(self): + assert isinstance(self.value, str) return 'substring("%s", matches = %s)' % ( self.value, ", ".join(map(lambda s: '"' + s + '"', (self.matches or {}).keys())), @@ -269,7 +268,7 @@ class _MatchedSubstring(Substring): class Regex(String): def __init__(self, value: str, description=None): - super(Regex, self).__init__(value, description=description) + super().__init__(value, description=description) self.value = value pat = self.value[len("/") : -len("/")] @@ -279,12 +278,12 @@ class Regex(String): flags |= re.IGNORECASE try: self.re = re.compile(pat, flags) - except re.error: + except re.error as exc: if value.endswith("/i"): value = value[: -len("i")] raise ValueError( "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value - ) + ) from exc def evaluate(self, ctx, short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 @@ -292,7 +291,7 @@ class Regex(String): # mapping from string value to list of locations. # will unique the locations later on. - matches = collections.defaultdict(list) + matches: typing.DefaultDict[str, Set[Address]] = collections.defaultdict(set) for feature, locations in ctx.items(): if not isinstance(feature, (String,)): @@ -307,32 +306,28 @@ class Regex(String): # using this mode cleans is more convenient for rule authors, # so that they don't have to prefix/suffix their terms like: /.*foo.*/. if self.re.search(feature.value): - matches[feature.value].extend(locations) + matches[feature.value].update(locations) if short_circuit: # we found one matching string, thats sufficient to match. # don't collect other matching strings in this mode. 
break if matches: - # finalize: defaultdict -> dict - # which makes json serialization easier - matches = dict(matches) - # collect all locations locations = set() - for s in matches.keys(): - matches[s] = list(set(matches[s])) - locations.update(matches[s]) + for locs in matches.values(): + locations.update(locs) # unlike other features, we cannot return put a reference to `self` directly in a `Result`. # this is because `self` may match on many strings, so we can't stuff the matched value into it. # instead, return a new instance that has a reference to both the regex and the matched values. # see #262. - return Result(True, _MatchedRegex(self, matches), [], locations=locations) + return Result(True, _MatchedRegex(self, dict(matches)), [], locations=locations) else: return Result(False, _MatchedRegex(self, {}), []) def __str__(self): + assert isinstance(self.value, str) return "regex(string =~ %s)" % self.value @@ -350,7 +345,7 @@ class _MatchedRegex(Regex): regex: the regex feature that matches. matches: mapping from matching string to its locations. """ - super(_MatchedRegex, self).__init__(str(regex.value), description=regex.description) + super().__init__(str(regex.value), description=regex.description) # we want this to collide with the name of `Regex` above, # so that it works nicely with the renderers. 
self.name = "regex" @@ -358,6 +353,7 @@ class _MatchedRegex(Regex): self.matches = matches def __str__(self): + assert isinstance(self.value, str) return "regex(string =~ %s, matches = %s)" % ( self.value, ", ".join(map(lambda s: '"' + s + '"', (self.matches or {}).keys())), @@ -373,23 +369,26 @@ class StringFactory: class Bytes(Feature): def __init__(self, value: bytes, description=None): - super(Bytes, self).__init__(value, description=description) + super().__init__(value, description=description) self.value = value def evaluate(self, ctx, **kwargs): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.bytes"] += 1 + assert isinstance(self.value, bytes) for feature, locations in ctx.items(): if not isinstance(feature, (Bytes,)): continue + assert isinstance(feature.value, bytes) if feature.value.startswith(self.value): return Result(True, self, [], locations=locations) return Result(False, self, []) def get_value_str(self): + assert isinstance(self.value, bytes) return hex_string(bytes_to_str(self.value)) @@ -403,7 +402,7 @@ VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY) class Arch(Feature): def __init__(self, value: str, description=None): - super(Arch, self).__init__(value, description=description) + super().__init__(value, description=description) self.name = "arch" @@ -418,7 +417,7 @@ VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY}) class OS(Feature): def __init__(self, value: str, description=None): - super(OS, self).__init__(value, description=description) + super().__init__(value, description=description) self.name = "os" @@ -436,7 +435,7 @@ FORMAT_UNKNOWN = "unknown" class Format(Feature): def __init__(self, value: str, description=None): - super(Format, self).__init__(value, description=description) + super().__init__(value, description=description) self.name = "format" diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index df1f706d..3be983ed 100644 --- 
a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -87,7 +87,7 @@ class FeatureExtractor: # for example, the Vivisect feature extract might require the vw and/or path. # this base class doesn't know what to do with that info, though. # - super(FeatureExtractor, self).__init__() + super().__init__() @abc.abstractmethod def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index 4f23a34c..bd4b9c9e 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -8,24 +8,74 @@ from __future__ import annotations -from typing import List, Tuple, Iterator +from enum import Enum +from typing import Dict, List, Tuple, Union, Iterator, Optional import dnfile +from dncil.cil.opcode import OpCodes import capa.features.extractors import capa.features.extractors.dnfile.file import capa.features.extractors.dnfile.insn +import capa.features.extractors.dnfile.function from capa.features.common import Feature from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress +from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor -from capa.features.extractors.dnfile.helpers import get_dotnet_managed_method_bodies +from capa.features.extractors.dnfile.helpers import ( + get_dotnet_types, + get_dotnet_fields, + get_dotnet_managed_imports, + get_dotnet_managed_methods, + get_dotnet_unmanaged_imports, + get_dotnet_managed_method_bodies, +) + + +class DnFileFeatureExtractorCache: + def __init__(self, pe: dnfile.dnPE): + self.imports: Dict[int, Union[DnType, DnUnmanagedMethod]] = {} + self.native_imports: Dict[int, Union[DnType, DnUnmanagedMethod]] = {} + self.methods: Dict[int, Union[DnType, 
DnUnmanagedMethod]] = {} + self.fields: Dict[int, Union[DnType, DnUnmanagedMethod]] = {} + self.types: Dict[int, Union[DnType, DnUnmanagedMethod]] = {} + + for import_ in get_dotnet_managed_imports(pe): + self.imports[import_.token] = import_ + for native_import in get_dotnet_unmanaged_imports(pe): + self.native_imports[native_import.token] = native_import + for method in get_dotnet_managed_methods(pe): + self.methods[method.token] = method + for field in get_dotnet_fields(pe): + self.fields[field.token] = field + for type_ in get_dotnet_types(pe): + self.types[type_.token] = type_ + + def get_import(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]: + return self.imports.get(token, None) + + def get_native_import(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]: + return self.native_imports.get(token, None) + + def get_method(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]: + return self.methods.get(token, None) + + def get_field(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]: + return self.fields.get(token, None) + + def get_type(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]: + return self.types.get(token, None) class DnfileFeatureExtractor(FeatureExtractor): def __init__(self, path: str): - super(DnfileFeatureExtractor, self).__init__() + super().__init__() self.pe: dnfile.dnPE = dnfile.dnPE(path) + # pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction + # most relevant at instruction scope + self.token_cache: DnFileFeatureExtractorCache = DnFileFeatureExtractorCache(self.pe) + # pre-compute these because we'll yield them at *every* scope. 
self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_os(pe=self.pe)) @@ -41,12 +91,45 @@ class DnfileFeatureExtractor(FeatureExtractor): yield from capa.features.extractors.dnfile.file.extract_features(self.pe) def get_functions(self) -> Iterator[FunctionHandle]: - for token, f in get_dotnet_managed_method_bodies(self.pe): - yield FunctionHandle(address=DNTokenAddress(token), inner=f, ctx={"pe": self.pe}) + # create a method lookup table + methods: Dict[Address, FunctionHandle] = {} + for (token, method) in get_dotnet_managed_method_bodies(self.pe): + fh: FunctionHandle = FunctionHandle( + address=DNTokenAddress(token), + inner=method, + ctx={"pe": self.pe, "calls_from": set(), "calls_to": set(), "cache": self.token_cache}, + ) - def extract_function_features(self, f): - # TODO - yield from [] + # method tokens should be unique + assert fh.address not in methods.keys() + methods[fh.address] = fh + + # calculate unique calls to/from each method + for fh in methods.values(): + for insn in fh.inner.instructions: + if insn.opcode not in ( + OpCodes.Call, + OpCodes.Callvirt, + OpCodes.Jmp, + OpCodes.Newobj, + ): + continue + + address: DNTokenAddress = DNTokenAddress(insn.operand.value) + + # record call to destination method; note: we only consider MethodDef methods for destinations + dest: Optional[FunctionHandle] = methods.get(address, None) + if dest is not None: + dest.ctx["calls_to"].add(fh.address) + + # record call from source method; note: we record all unique calls from a MethodDef method, not just + # those calls to other MethodDef methods e.g. 
calls to imported MemberRef methods + fh.ctx["calls_from"].add(address) + + yield from methods.values() + + def extract_function_features(self, fh) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.dnfile.function.extract_features(fh) def get_basic_blocks(self, f) -> Iterator[BBHandle]: # each dotnet method is considered 1 basic block diff --git a/capa/features/extractors/dnfile/function.py b/capa/features/extractors/dnfile/function.py new file mode 100644 index 00000000..0d698719 --- /dev/null +++ b/capa/features/extractors/dnfile/function.py @@ -0,0 +1,50 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +from __future__ import annotations + +import logging +from typing import Tuple, Iterator + +from capa.features.common import Feature, Characteristic +from capa.features.address import Address +from capa.features.extractors.base_extractor import FunctionHandle + +logger = logging.getLogger(__name__) + + +def extract_function_calls_to(fh: FunctionHandle) -> Iterator[Tuple[Characteristic, Address]]: + """extract callers to a function""" + for dest in fh.ctx["calls_to"]: + yield Characteristic("calls to"), dest + + +def extract_function_calls_from(fh: FunctionHandle) -> Iterator[Tuple[Characteristic, Address]]: + """extract callers from a function""" + for src in fh.ctx["calls_from"]: + yield Characteristic("calls from"), src + + +def extract_recursive_call(fh: FunctionHandle) -> Iterator[Tuple[Characteristic, Address]]: + """extract recursive function call""" + if fh.address in fh.ctx["calls_to"]: + yield Characteristic("recursive call"), fh.address + + +def extract_function_loop(fh: FunctionHandle) -> Iterator[Tuple[Characteristic, Address]]: + """extract loop indicators from a function""" + raise NotImplementedError() + + +def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + for func_handler in FUNCTION_HANDLERS: + for (feature, addr) in func_handler(fh): + yield feature, addr + + +FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_calls_from, extract_recursive_call) diff --git a/capa/features/extractors/dnfile/helpers.py b/capa/features/extractors/dnfile/helpers.py index 3fef794d..d79d802b 100644 --- a/capa/features/extractors/dnfile/helpers.py +++ b/capa/features/extractors/dnfile/helpers.py @@ -9,8 +9,7 @@ from __future__ import annotations import logging -from enum import Enum -from typing import Any, Tuple, Iterator, Optional +from typing import Dict, Tuple, Union, Iterator, Optional import dnfile from dncil.cil.body import CilMethodBody @@ -19,12 +18,10 @@ from dncil.clr.token import Token, StringToken, InvalidToken 
from dncil.cil.body.reader import CilMethodBodyReaderBase from capa.features.common import FeatureAccess +from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod logger = logging.getLogger(__name__) -# key indexes to dotnet metadata tables -DOTNET_META_TABLES_BY_INDEX = {table.value: table.name for table in dnfile.enums.MetadataTables} - class DnfileMethodBodyReader(CilMethodBodyReaderBase): def __init__(self, pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow): @@ -44,85 +41,20 @@ class DnfileMethodBodyReader(CilMethodBodyReaderBase): return self.offset -class DnType(object): - def __init__(self, token: int, class_: str, namespace: str = "", member: str = "", access: Optional[str] = None): - self.token = token - self.access = access - self.namespace = namespace - self.class_ = class_ - if member == ".ctor": - member = "ctor" - if member == ".cctor": - member = "cctor" - self.member = member - - def __hash__(self): - return hash((self.token, self.access, self.namespace, self.class_, self.member)) - - def __eq__(self, other): - return ( - self.token == other.token - and self.access == other.access - and self.namespace == other.namespace - and self.class_ == other.class_ - and self.member == other.member - ) - - def __str__(self): - return DnType.format_name(self.class_, namespace=self.namespace, member=self.member) - - def __repr__(self): - return str(self) - - @staticmethod - def format_name(class_: str, namespace: str = "", member: str = ""): - # like File::OpenRead - name: str = f"{class_}::{member}" if member else class_ - if namespace: - # like System.IO.File::OpenRead - name = f"{namespace}.{name}" - return name - - -class DnUnmanagedMethod: - def __init__(self, token: int, module: str, method: str): - self.token: int = token - self.module: str = module - self.method: str = method - - def __hash__(self): - return hash((self.token, self.module, self.method)) - - def __eq__(self, other): - return self.token == other.token and self.module == 
other.module and self.method == other.method - - def __str__(self): - return DnUnmanagedMethod.format_name(self.module, self.method) - - def __repr__(self): - return str(self) - - @staticmethod - def format_name(module, method): - return f"{module}.{method}" - - -def resolve_dotnet_token(pe: dnfile.dnPE, token: Token) -> Any: +def resolve_dotnet_token(pe: dnfile.dnPE, token: Token) -> Union[dnfile.base.MDTableRow, InvalidToken, str]: """map generic token to string or table row""" + assert pe.net is not None + assert pe.net.mdtables is not None + if isinstance(token, StringToken): user_string: Optional[str] = read_dotnet_user_string(pe, token) if user_string is None: return InvalidToken(token.value) return user_string - table_name: str = DOTNET_META_TABLES_BY_INDEX.get(token.table, "") - if not table_name: - # table_index is not valid - return InvalidToken(token.value) - - table: Any = getattr(pe.net.mdtables, table_name, None) + table: Optional[dnfile.base.ClrMetaDataTable] = pe.net.mdtables.tables.get(token.table, None) if table is None: - # table index is valid but table is not present + # table index is not valid return InvalidToken(token.value) try: @@ -137,16 +69,23 @@ def read_dotnet_method_body(pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow) - try: return CilMethodBody(DnfileMethodBodyReader(pe, row)) except MethodBodyFormatError as e: - logger.warning("failed to parse managed method body @ 0x%08x (%s)" % (row.Rva, e)) + logger.debug("failed to parse managed method body @ 0x%08x (%s)", row.Rva, e) return None def read_dotnet_user_string(pe: dnfile.dnPE, token: StringToken) -> Optional[str]: """read user string from #US stream""" + assert pe.net is not None + + if pe.net.user_strings is None: + # stream may not exist (seen in obfuscated .NET) + logger.debug("#US stream does not exist for stream index 0x%08x", token.rid) + return None + try: user_string: Optional[dnfile.stream.UserString] = pe.net.user_strings.get_us(token.rid) except UnicodeDecodeError as 
e: - logger.warning("failed to decode #US stream index 0x%08x (%s)" % (token.rid, e)) + logger.debug("failed to decode #US stream index 0x%08x (%s)", token.rid, e) return None if user_string is None: @@ -169,11 +108,73 @@ def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[DnType]: TypeName (index into String heap) TypeNamespace (index into String heap) """ - for (rid, row) in enumerate(iter_dotnet_table(pe, "MemberRef")): - if not isinstance(row.Class.row, dnfile.mdtable.TypeRefRow): + for (rid, member_ref) in iter_dotnet_table(pe, dnfile.mdtable.MemberRef.number): + assert isinstance(member_ref, dnfile.mdtable.MemberRefRow) + + if not isinstance(member_ref.Class.row, dnfile.mdtable.TypeRefRow): + # only process class imports from TypeRef table continue - token: int = calculate_dotnet_token_value(pe.net.mdtables.MemberRef.number, rid + 1) - yield DnType(token, row.Class.row.TypeName, namespace=row.Class.row.TypeNamespace, member=row.Name) + + token: int = calculate_dotnet_token_value(dnfile.mdtable.MemberRef.number, rid) + access: Optional[str] + + # assume .NET imports starting with get_/set_ are used to access a property + if member_ref.Name.startswith("get_"): + access = FeatureAccess.READ + elif member_ref.Name.startswith("set_"): + access = FeatureAccess.WRITE + else: + access = None + + member_ref_name: str = member_ref.Name + if member_ref_name.startswith(("get_", "set_")): + # remove get_/set_ from MemberRef name + member_ref_name = member_ref_name[4:] + + yield DnType( + token, + member_ref.Class.row.TypeName, + namespace=member_ref.Class.row.TypeNamespace, + member=member_ref_name, + access=access, + ) + + +def get_dotnet_methoddef_property_accessors(pe: dnfile.dnPE) -> Iterator[Tuple[int, str]]: + """get MethodDef methods used to access properties + + see https://www.ntcore.com/files/dotnetformat.htm + + 24 - MethodSemantics Table + Links Events and Properties to specific methods. For example one Event can be associated to more methods. 
A property uses this table to associate get/set methods. + Semantics (a 2-byte bitmask of type MethodSemanticsAttributes) + Method (index into the MethodDef table) + Association (index into the Event or Property table; more precisely, a HasSemantics coded index) + """ + for (rid, method_semantics) in iter_dotnet_table(pe, dnfile.mdtable.MethodSemantics.number): + assert isinstance(method_semantics, dnfile.mdtable.MethodSemanticsRow) + + if method_semantics.Association.row is None: + logger.debug("MethodSemantics[0x%X] Association row is None", rid) + continue + + if isinstance(method_semantics.Association.row, dnfile.mdtable.EventRow): + # ignore events + logger.debug("MethodSemantics[0x%X] ignoring Event", rid) + continue + + if method_semantics.Method.table is None: + logger.debug("MethodSemantics[0x%X] Method table is None", rid) + continue + + token: int = calculate_dotnet_token_value( + method_semantics.Method.table.number, method_semantics.Method.row_index + ) + + if method_semantics.Semantics.msSetter: + yield token, FeatureAccess.WRITE + elif method_semantics.Semantics.msGetter: + yield token, FeatureAccess.READ def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnType]: @@ -187,90 +188,72 @@ def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnType]: TypeNamespace (index into String heap) MethodList (index into MethodDef table; it marks the first of a continguous run of Methods owned by this Type) """ - for row in iter_dotnet_table(pe, "TypeDef"): - for index in row.MethodList: - token = calculate_dotnet_token_value(index.table.number, index.row_index) - yield DnType(token, row.TypeName, namespace=row.TypeNamespace, member=index.row.Name) + accessor_map: Dict[int, str] = {} + for (methoddef, methoddef_access) in get_dotnet_methoddef_property_accessors(pe): + accessor_map[methoddef] = methoddef_access + + for (rid, typedef) in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number): + assert isinstance(typedef, dnfile.mdtable.TypeDefRow) + + 
for (idx, method) in enumerate(typedef.MethodList): + if method.table is None: + logger.debug("TypeDef[0x%X] MethodList[0x%X] table is None", rid, idx) + continue + if method.row is None: + logger.debug("TypeDef[0x%X] MethodList[0x%X] row is None", rid, idx) + continue + + token: int = calculate_dotnet_token_value(method.table.number, method.row_index) + access: Optional[str] = accessor_map.get(token, None) + + method_name: str = method.row.Name + if method_name.startswith(("get_", "set_")): + # remove get_/set_ + method_name = method_name[4:] + + yield DnType(token, typedef.TypeName, namespace=typedef.TypeNamespace, member=method_name, access=access) def get_dotnet_fields(pe: dnfile.dnPE) -> Iterator[DnType]: - """get fields from TypeDef table""" - for row in iter_dotnet_table(pe, "TypeDef"): - for index in row.FieldList: - token = calculate_dotnet_token_value(index.table.number, index.row_index) - yield DnType(token, row.TypeName, namespace=row.TypeNamespace, member=index.row.Name) - - -def get_dotnet_property_map( - pe: dnfile.dnPE, property_row: dnfile.mdtable.PropertyRow -) -> Optional[dnfile.mdtable.TypeDefRow]: - """get property map from PropertyMap table + """get fields from TypeDef table see https://www.ntcore.com/files/dotnetformat.htm - 21 - PropertyMap Table - List of Properties owned by a specific class. - Parent (index into the TypeDef table) - PropertyList (index into Property table). It marks the first of a contiguous run of Properties owned by Parent. The run continues to the smaller of: - the last row of the Property table - the next run of Properties, found by inspecting the PropertyList of the next row in this PropertyMap table + 02 - TypeDef Table + Each row represents a class in the current assembly. 
+ TypeName (index into String heap) + TypeNamespace (index into String heap) + FieldList (index into Field table; it marks the first of a continguous run of Fields owned by this Type) """ - for row in iter_dotnet_table(pe, "PropertyMap"): - for index in row.PropertyList: - if index.row.Name == property_row.Name: - return row.Parent.row - return None + for (rid, typedef) in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number): + assert isinstance(typedef, dnfile.mdtable.TypeDefRow) - -def get_dotnet_properties(pe: dnfile.dnPE) -> Iterator[DnType]: - """get property from MethodSemantics table - - see https://www.ntcore.com/files/dotnetformat.htm - - 24 - MethodSemantics Table - Links Events and Properties to specific methods. For example one Event can be associated to more methods. A property uses this table to associate get/set methods. - Semantics (a 2-byte bitmask of type MethodSemanticsAttributes) - Method (index into the MethodDef table) - Association (index into the Event or Property table; more precisely, a HasSemantics coded index) - """ - for row in iter_dotnet_table(pe, "MethodSemantics"): - typedef_row = get_dotnet_property_map(pe, row.Association.row) - if typedef_row is None: - continue - - token = calculate_dotnet_token_value(row.Method.table.number, row.Method.row_index) - - if row.Semantics.msSetter: - access = FeatureAccess.WRITE - elif row.Semantics.msGetter: - access = FeatureAccess.READ - else: - access = None - - yield DnType( - token, - typedef_row.TypeName, - access=access, - namespace=typedef_row.TypeNamespace, - member=row.Association.row.Name, - ) + for (idx, field) in enumerate(typedef.FieldList): + if field.table is None: + logger.debug("TypeDef[0x%X] FieldList[0x%X] table is None", rid, idx) + continue + if field.row is None: + logger.debug("TypeDef[0x%X] FieldList[0x%X] row is None", rid, idx) + continue + token: int = calculate_dotnet_token_value(field.table.number, field.row_index) + yield DnType(token, typedef.TypeName, 
namespace=typedef.TypeNamespace, member=field.row.Name) def get_dotnet_managed_method_bodies(pe: dnfile.dnPE) -> Iterator[Tuple[int, CilMethodBody]]: """get managed methods from MethodDef table""" - if not hasattr(pe.net.mdtables, "MethodDef"): - return + for (rid, method_def) in iter_dotnet_table(pe, dnfile.mdtable.MethodDef.number): + assert isinstance(method_def, dnfile.mdtable.MethodDefRow) - for (rid, row) in enumerate(pe.net.mdtables.MethodDef): - if not row.ImplFlags.miIL or any((row.Flags.mdAbstract, row.Flags.mdPinvokeImpl)): + if not method_def.ImplFlags.miIL or any((method_def.Flags.mdAbstract, method_def.Flags.mdPinvokeImpl)): # skip methods that do not have a method body continue - body: Optional[CilMethodBody] = read_dotnet_method_body(pe, row) + body: Optional[CilMethodBody] = read_dotnet_method_body(pe, method_def) if body is None: + logger.debug("MethodDef[0x%X] method body is None", rid) continue - token: int = calculate_dotnet_token_value(dnfile.enums.MetadataTables.MethodDef.value, rid + 1) + token: int = calculate_dotnet_token_value(dnfile.mdtable.MethodDef.number, rid) yield token, body @@ -285,14 +268,29 @@ def get_dotnet_unmanaged_imports(pe: dnfile.dnPE) -> Iterator[DnUnmanagedMethod] ImportName (index into the String heap) ImportScope (index into the ModuleRef table) """ - for row in iter_dotnet_table(pe, "ImplMap"): - module: str = row.ImportScope.row.Name - method: str = row.ImportName + for (rid, impl_map) in iter_dotnet_table(pe, dnfile.mdtable.ImplMap.number): + assert isinstance(impl_map, dnfile.mdtable.ImplMapRow) + + module: str + if impl_map.ImportScope.row is None: + logger.debug("ImplMap[0x%X] ImportScope row is None", rid) + module = "" + else: + module = impl_map.ImportScope.row.Name + method: str = impl_map.ImportName + + member_forward_table: int + if impl_map.MemberForwarded.table is None: + logger.debug("ImplMap[0x%X] MemberForwarded table is None", rid) + continue + else: + member_forward_table = 
impl_map.MemberForwarded.table.number + member_forward_row: int = impl_map.MemberForwarded.row_index # ECMA says "Each row of the ImplMap table associates a row in the MethodDef table (MemberForwarded) with the # name of a routine (ImportName) in some unmanaged DLL (ImportScope)"; so we calculate and map the MemberForwarded # MethodDef table token to help us later record native import method calls made from CIL - token: int = calculate_dotnet_token_value(row.MemberForwarded.table.number, row.MemberForwarded.row_index) + token: int = calculate_dotnet_token_value(member_forward_table, member_forward_row) # like Kernel32.dll if module and "." in module: @@ -302,20 +300,36 @@ def get_dotnet_unmanaged_imports(pe: dnfile.dnPE) -> Iterator[DnUnmanagedMethod] yield DnUnmanagedMethod(token, module, method) +def get_dotnet_types(pe: dnfile.dnPE) -> Iterator[DnType]: + """get .NET types from TypeDef and TypeRef tables""" + for (rid, typedef) in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number): + assert isinstance(typedef, dnfile.mdtable.TypeDefRow) + + typedef_token: int = calculate_dotnet_token_value(dnfile.mdtable.TypeDef.number, rid) + yield DnType(typedef_token, typedef.TypeName, namespace=typedef.TypeNamespace) + + for (rid, typeref) in iter_dotnet_table(pe, dnfile.mdtable.TypeRef.number): + assert isinstance(typeref, dnfile.mdtable.TypeRefRow) + + typeref_token: int = calculate_dotnet_token_value(dnfile.mdtable.TypeRef.number, rid) + yield DnType(typeref_token, typeref.TypeName, namespace=typeref.TypeNamespace) + + def calculate_dotnet_token_value(table: int, rid: int) -> int: return ((table & 0xFF) << Token.TABLE_SHIFT) | (rid & Token.RID_MASK) -def is_dotnet_table_valid(pe: dnfile.dnPE, table_name: str) -> bool: - return bool(getattr(pe.net.mdtables, table_name, None)) - - def is_dotnet_mixed_mode(pe: dnfile.dnPE) -> bool: + assert pe.net is not None + assert pe.net.Flags is not None + return not bool(pe.net.Flags.CLR_ILONLY) -def iter_dotnet_table(pe: 
dnfile.dnPE, name: str) -> Iterator[Any]: - if not is_dotnet_table_valid(pe, name): - return - for row in getattr(pe.net.mdtables, name): - yield row +def iter_dotnet_table(pe: dnfile.dnPE, table_index: int) -> Iterator[Tuple[int, dnfile.base.MDTableRow]]: + assert pe.net is not None + assert pe.net.mdtables is not None + + for (rid, row) in enumerate(pe.net.mdtables.tables.get(table_index, [])): + # .NET tables are 1-indexed + yield rid + 1, row diff --git a/capa/features/extractors/dnfile/insn.py b/capa/features/extractors/dnfile/insn.py index da88464f..fb95d5cd 100644 --- a/capa/features/extractors/dnfile/insn.py +++ b/capa/features/extractors/dnfile/insn.py @@ -8,113 +8,76 @@ from __future__ import annotations -from typing import Any, Dict, Tuple, Union, Iterator, Optional +import logging +from typing import TYPE_CHECKING, Any, Dict, Tuple, Union, Iterator, Optional + +if TYPE_CHECKING: + from capa.features.extractors.dnfile.extractor import DnFileFeatureExtractorCache import dnfile -from dncil.cil.body import CilMethodBody from dncil.clr.token import Token, StringToken, InvalidToken from dncil.cil.opcode import OpCodes -from dncil.cil.instruction import Instruction import capa.features.extractors.helpers from capa.features.insn import API, Number, Property from capa.features.common import Class, String, Feature, Namespace, FeatureAccess, Characteristic from capa.features.address import Address +from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.dnfile.helpers import ( - DnType, - DnUnmanagedMethod, - get_dotnet_fields, resolve_dotnet_token, - get_dotnet_properties, read_dotnet_user_string, - get_dotnet_managed_imports, - get_dotnet_managed_methods, - get_dotnet_unmanaged_imports, + calculate_dotnet_token_value, ) -METHODDEF_TABLE = dnfile.mdtable.MethodDef.number -MEMBERREF_TABLE = dnfile.mdtable.MemberRef.number 
-FIELD_TABLE = dnfile.mdtable.Field.number +logger = logging.getLogger(__name__) -def get_managed_imports(ctx: Dict) -> Dict: - if "managed_imports_cache" not in ctx: - ctx["managed_imports_cache"] = {} - for method in get_dotnet_managed_imports(ctx["pe"]): - ctx["managed_imports_cache"][method.token] = method - return ctx["managed_imports_cache"] +def get_callee( + pe: dnfile.dnPE, cache: DnFileFeatureExtractorCache, token: Token +) -> Optional[Union[DnType, DnUnmanagedMethod]]: + """map .NET token to un/managed (generic) method""" + token_: int + if token.table == dnfile.mdtable.MethodSpec.number: + # map MethodSpec to MethodDef or MemberRef + row: Union[dnfile.base.MDTableRow, InvalidToken, str] = resolve_dotnet_token(pe, token) + assert isinstance(row, dnfile.mdtable.MethodSpecRow) + if row.Method.table is None: + logger.debug("MethodSpec[0x%X] Method table is None", token.rid) + return None -def get_unmanaged_imports(ctx: Dict) -> Dict: - if "unmanaged_imports_cache" not in ctx: - ctx["unmanaged_imports_cache"] = {} - for imp in get_dotnet_unmanaged_imports(ctx["pe"]): - ctx["unmanaged_imports_cache"][imp.token] = imp - return ctx["unmanaged_imports_cache"] + token_ = calculate_dotnet_token_value(row.Method.table.number, row.Method.row_index) + else: + token_ = token.value - -def get_methods(ctx: Dict) -> Dict: - if "methods_cache" not in ctx: - ctx["methods_cache"] = {} - for method in get_dotnet_managed_methods(ctx["pe"]): - ctx["methods_cache"][method.token] = method - return ctx["methods_cache"] - - -def get_callee(ctx: Dict, token: int) -> Union[DnType, DnUnmanagedMethod, None]: - """map dotnet token to un/managed method""" - callee: Union[DnType, DnUnmanagedMethod, None] = get_managed_imports(ctx).get(token, None) + callee: Optional[Union[DnType, DnUnmanagedMethod]] = cache.get_import(token_) if callee is None: # we must check unmanaged imports before managed methods because we map forwarded managed methods # to their unmanaged imports; we prefer a 
forwarded managed method be mapped to its unmanaged import for analysis - callee = get_unmanaged_imports(ctx).get(token, None) + callee = cache.get_native_import(token_) if callee is None: - callee = get_methods(ctx).get(token, None) + callee = cache.get_method(token_) return callee -def get_properties(ctx: Dict) -> Dict: - if "properties_cache" not in ctx: - ctx["properties_cache"] = {} - for prop in get_dotnet_properties(ctx["pe"]): - ctx["properties_cache"][prop.token] = prop - return ctx["properties_cache"] - - -def get_fields(ctx: Dict) -> Dict: - if "fields_cache" not in ctx: - ctx["fields_cache"] = {} - for field in get_dotnet_fields(ctx["pe"]): - ctx["fields_cache"][field.token] = field - return ctx["fields_cache"] - - def extract_insn_api_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """parse instruction API features""" - insn: Instruction = ih.inner - - if insn.opcode not in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli, OpCodes.Newobj): - return - - callee: Union[DnType, DnUnmanagedMethod, None] = get_callee(fh.ctx, insn.operand.value) - if callee is None: + if ih.inner.opcode not in ( + OpCodes.Call, + OpCodes.Callvirt, + OpCodes.Jmp, + OpCodes.Newobj, + ): return + callee: Optional[Union[DnType, DnUnmanagedMethod]] = get_callee(fh.ctx["pe"], fh.ctx["cache"], ih.inner.operand) if isinstance(callee, DnType): - if callee.member.startswith(("get_", "set_")): - if insn.operand.table == METHODDEF_TABLE: - # check if the method belongs to the MethodDef table and whether it is used to access a property - if get_properties(fh.ctx).get(insn.operand.value, None) is not None: - return - elif insn.operand.table == MEMBERREF_TABLE: - # if the method belongs to the MemberRef table, we assume it is used to access a property - return - - # like System.IO.File::Delete - yield API(str(callee)), ih.address - - else: + # ignore methods used to access properties + if callee.access is None: + # like 
System.IO.File::Delete + yield API(str(callee)), ih.address + elif isinstance(callee, DnUnmanagedMethod): # like kernel32.CreateFileA for name in capa.features.extractors.helpers.generate_symbols(callee.module, callee.method): yield API(name), ih.address @@ -122,52 +85,30 @@ def extract_insn_api_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterato def extract_insn_property_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """parse instruction property features""" - insn: Instruction = ih.inner - name: Optional[str] = None access: Optional[str] = None - if insn.opcode in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli): - if insn.operand.table == METHODDEF_TABLE: - # check if the method belongs to the MethodDef table and whether it is used to access a property - prop = get_properties(fh.ctx).get(insn.operand.value, None) - if prop is not None: - name = str(prop) - access = prop.access + if ih.inner.opcode in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp): + # property access via MethodDef or MemberRef + callee: Optional[Union[DnType, DnUnmanagedMethod]] = get_callee(fh.ctx["pe"], fh.ctx["cache"], ih.inner.operand) + if isinstance(callee, DnType): + if callee.access is not None: + name = str(callee) + access = callee.access - elif insn.operand.table == MEMBERREF_TABLE: - # if the method belongs to the MemberRef table, we assume it is used to access a property - row: Any = resolve_dotnet_token(fh.ctx["pe"], insn.operand) - if row is None: - return - if not isinstance(row.Class.row, (dnfile.mdtable.TypeRefRow, dnfile.mdtable.TypeDefRow)): - return - if not row.Name.startswith(("get_", "set_")): - return + elif ih.inner.opcode in (OpCodes.Ldfld, OpCodes.Ldflda, OpCodes.Ldsfld, OpCodes.Ldsflda): + # property read via Field + read_field: Optional[Union[DnType, DnUnmanagedMethod]] = fh.ctx["cache"].get_field(ih.inner.operand.value) + if read_field is not None: + name = str(read_field) + access = FeatureAccess.READ - 
name = DnType.format_name( - row.Class.row.TypeName, namespace=row.Class.row.TypeNamespace, member=row.Name[4:] - ) - if row.Name.startswith("get_"): - access = FeatureAccess.READ - elif row.Name.startswith("set_"): - access = FeatureAccess.WRITE - - elif insn.opcode in (OpCodes.Ldfld, OpCodes.Ldflda, OpCodes.Ldsfld, OpCodes.Ldsflda): - if insn.operand.table == FIELD_TABLE: - # determine whether the operand is a field by checking if it belongs to the Field table - read_field: Optional[DnType] = get_fields(fh.ctx).get(insn.operand.value, None) - if read_field: - name = str(read_field) - access = FeatureAccess.READ - - elif insn.opcode in (OpCodes.Stfld, OpCodes.Stsfld): - if insn.operand.table == FIELD_TABLE: - # determine whether the operand is a field by checking if it belongs to the Field table - write_field: Optional[DnType] = get_fields(fh.ctx).get(insn.operand.value, None) - if write_field: - name = str(write_field) - access = FeatureAccess.WRITE + elif ih.inner.opcode in (OpCodes.Stfld, OpCodes.Stsfld): + # property write via Field + write_field: Optional[Union[DnType, DnUnmanagedMethod]] = fh.ctx["cache"].get_field(ih.inner.operand.value) + if write_field is not None: + name = str(write_field) + access = FeatureAccess.WRITE if name is not None: if access is not None: @@ -175,94 +116,78 @@ def extract_insn_property_features(fh: FunctionHandle, bh, ih: InsnHandle) -> It yield Property(name), ih.address -def extract_insn_class_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterator[Tuple[Class, Address]]: - """parse instruction class features""" - if ih.inner.opcode not in ( +def extract_insn_namespace_class_features( + fh: FunctionHandle, bh, ih: InsnHandle +) -> Iterator[Tuple[Union[Namespace, Class], Address]]: + """parse instruction namespace and class features""" + type_: Optional[Union[DnType, DnUnmanagedMethod]] = None + + if ih.inner.opcode in ( OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, - OpCodes.Calli, + OpCodes.Ldvirtftn, + OpCodes.Ldftn, + 
OpCodes.Newobj, + ): + # method call - includes managed methods (MethodDef, TypeRef) and properties (MethodSemantics, TypeRef) + type_ = get_callee(fh.ctx["pe"], fh.ctx["cache"], ih.inner.operand) + + elif ih.inner.opcode in ( OpCodes.Ldfld, OpCodes.Ldflda, OpCodes.Ldsfld, OpCodes.Ldsflda, OpCodes.Stfld, OpCodes.Stsfld, - OpCodes.Newobj, ): - return + # field access + type_ = fh.ctx["cache"].get_field(ih.inner.operand.value) - row: Any = resolve_dotnet_token(fh.ctx["pe"], ih.inner.operand) - if isinstance(row, dnfile.mdtable.MemberRefRow): - if isinstance(row.Class.row, (dnfile.mdtable.TypeRefRow, dnfile.mdtable.TypeDefRow)): - yield Class(DnType.format_name(row.Class.row.TypeName, namespace=row.Class.row.TypeNamespace)), ih.address - - elif isinstance(row, dnfile.mdtable.MethodDefRow): - callee: Union[DnType, DnUnmanagedMethod, None] = get_callee(fh.ctx, ih.inner.operand.value) - if isinstance(callee, DnType): - yield Class(DnType.format_name(callee.class_, namespace=callee.namespace)), ih.address - - elif isinstance(row, dnfile.mdtable.FieldRow): - field: Optional[DnType] = get_fields(fh.ctx).get(ih.inner.operand.value, None) - if field is not None: - yield Class(DnType.format_name(field.class_, namespace=field.namespace)), ih.address - - -def extract_insn_namespace_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterator[Tuple[Namespace, Address]]: - """parse instruction namespace features""" - if ih.inner.opcode not in ( - OpCodes.Call, - OpCodes.Callvirt, - OpCodes.Jmp, - OpCodes.Calli, - OpCodes.Ldfld, - OpCodes.Ldflda, - OpCodes.Ldsfld, - OpCodes.Ldsflda, - OpCodes.Stfld, - OpCodes.Stsfld, - OpCodes.Newobj, + # ECMA 335 VI.C.4.10 + elif ih.inner.opcode in ( + OpCodes.Initobj, + OpCodes.Box, + OpCodes.Castclass, + OpCodes.Cpobj, + OpCodes.Isinst, + OpCodes.Ldelem, + OpCodes.Ldelema, + OpCodes.Ldobj, + OpCodes.Mkrefany, + OpCodes.Newarr, + OpCodes.Refanyval, + OpCodes.Sizeof, + OpCodes.Stobj, + OpCodes.Unbox, + OpCodes.Constrained, + OpCodes.Stelem, + 
OpCodes.Unbox_Any, ): - return + # type access + type_ = fh.ctx["cache"].get_type(ih.inner.operand.value) - row: Any = resolve_dotnet_token(fh.ctx["pe"], Token(ih.inner.operand.value)) - - if isinstance(row, dnfile.mdtable.MemberRefRow): - if isinstance(row.Class.row, (dnfile.mdtable.TypeRefRow, dnfile.mdtable.TypeDefRow)): - if row.Class.row.TypeNamespace: - yield Namespace(row.Class.row.TypeNamespace), ih.address - - elif isinstance(row, dnfile.mdtable.MethodDefRow): - callee: Union[DnType, DnUnmanagedMethod, None] = get_callee(fh.ctx, ih.inner.operand.value) - if isinstance(callee, DnType) and callee.namespace is not None: - yield Namespace(callee.namespace), ih.address - - elif isinstance(row, dnfile.mdtable.FieldRow): - field: Optional[DnType] = get_fields(fh.ctx).get(ih.inner.operand.value, None) - if field is not None: - yield Namespace(field.namespace), ih.address + if isinstance(type_, DnType): + yield Class(DnType.format_name(type_.class_, namespace=type_.namespace)), ih.address + if type_.namespace: + yield Namespace(type_.namespace), ih.address def extract_insn_number_features(fh, bh, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """parse instruction number features""" - insn: Instruction = ih.inner - - if insn.is_ldc(): - yield Number(insn.get_ldc()), ih.address + if ih.inner.is_ldc(): + yield Number(ih.inner.get_ldc()), ih.address def extract_insn_string_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """parse instruction string features""" - f: CilMethodBody = fh.inner - insn: Instruction = ih.inner - - if not insn.is_ldstr(): + if not ih.inner.is_ldstr(): return - if not isinstance(insn.operand, StringToken): + if not isinstance(ih.inner.operand, StringToken): return - user_string: Optional[str] = read_dotnet_user_string(fh.ctx["pe"], insn.operand) + user_string: Optional[str] = read_dotnet_user_string(fh.ctx["pe"], ih.inner.operand) if user_string is None: return @@ -272,17 +197,14 @@ def 
extract_insn_string_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iter def extract_unmanaged_call_characteristic_features( fh: FunctionHandle, bb: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Characteristic, Address]]: - insn: Instruction = ih.inner - if insn.opcode not in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli): + if ih.inner.opcode not in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp): return - token: Any = resolve_dotnet_token(fh.ctx["pe"], insn.operand) - if isinstance(token, InvalidToken): - return - if not isinstance(token, dnfile.mdtable.MethodDefRow): + row: Union[str, InvalidToken, dnfile.base.MDTableRow] = resolve_dotnet_token(fh.ctx["pe"], ih.inner.operand) + if not isinstance(row, dnfile.mdtable.MethodDefRow): return - if any((token.Flags.mdPinvokeImpl, token.ImplFlags.miUnmanaged, token.ImplFlags.miNative)): + if any((row.Flags.mdPinvokeImpl, row.ImplFlags.miUnmanaged, row.ImplFlags.miNative)): yield Characteristic("unmanaged call"), ih.address @@ -299,7 +221,6 @@ INSTRUCTION_HANDLERS = ( extract_insn_property_features, extract_insn_number_features, extract_insn_string_features, - extract_insn_namespace_features, - extract_insn_class_features, + extract_insn_namespace_class_features, extract_unmanaged_call_characteristic_features, ) diff --git a/capa/features/extractors/dnfile/types.py b/capa/features/extractors/dnfile/types.py new file mode 100644 index 00000000..822b5d67 --- /dev/null +++ b/capa/features/extractors/dnfile/types.py @@ -0,0 +1,75 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and limitations under the License. + +from enum import Enum +from typing import Union, Optional + + +class DnType(object): + def __init__(self, token: int, class_: str, namespace: str = "", member: str = "", access: Optional[str] = None): + self.token: int = token + self.access: Optional[str] = access + self.namespace: str = namespace + self.class_: str = class_ + + if member == ".ctor": + member = "ctor" + if member == ".cctor": + member = "cctor" + + self.member: str = member + + def __hash__(self): + return hash((self.token, self.access, self.namespace, self.class_, self.member)) + + def __eq__(self, other): + return ( + self.token == other.token + and self.access == other.access + and self.namespace == other.namespace + and self.class_ == other.class_ + and self.member == other.member + ) + + def __str__(self): + return DnType.format_name(self.class_, namespace=self.namespace, member=self.member) + + def __repr__(self): + return str(self) + + @staticmethod + def format_name(class_: str, namespace: str = "", member: str = ""): + # like File::OpenRead + name: str = f"{class_}::{member}" if member else class_ + if namespace: + # like System.IO.File::OpenRead + name = f"{namespace}.{name}" + return name + + +class DnUnmanagedMethod: + def __init__(self, token: int, module: str, method: str): + self.token: int = token + self.module: str = module + self.method: str = method + + def __hash__(self): + return hash((self.token, self.module, self.method)) + + def __eq__(self, other): + return self.token == other.token and self.module == other.module and self.method == other.method + + def __str__(self): + return DnUnmanagedMethod.format_name(self.module, self.method) + + def __repr__(self): + return str(self) + + @staticmethod + def format_name(module, method): + return f"{module}.{method}" diff --git a/capa/features/extractors/dnfile_.py b/capa/features/extractors/dnfile_.py index 7a459bec..7286001b 
100644 --- a/capa/features/extractors/dnfile_.py +++ b/capa/features/extractors/dnfile_.py @@ -19,9 +19,12 @@ def extract_file_os(**kwargs) -> Iterator[Tuple[Feature, Address]]: yield OS(OS_ANY), NO_ADDRESS -def extract_file_arch(pe, **kwargs) -> Iterator[Tuple[Feature, Address]]: +def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Feature, Address]]: # to distinguish in more detail, see https://stackoverflow.com/a/23614024/10548020 # .NET 4.5 added option: any CPU, 32-bit preferred + assert pe.net is not None + assert pe.net.Flags is not None + if pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE: yield Arch(ARCH_I386), NO_ADDRESS elif not pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE_PLUS: @@ -60,7 +63,7 @@ GLOBAL_HANDLERS = ( class DnfileFeatureExtractor(FeatureExtractor): def __init__(self, path: str): - super(DnfileFeatureExtractor, self).__init__() + super().__init__() self.path: str = path self.pe: dnfile.dnPE = dnfile.dnPE(path) @@ -71,6 +74,9 @@ class DnfileFeatureExtractor(FeatureExtractor): # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token # False: managed EP: RVA + assert self.pe.net is not None + assert self.pe.net.struct is not None + return self.pe.net.struct.EntryPointTokenOrRva def extract_global_features(self): @@ -83,13 +89,29 @@ class DnfileFeatureExtractor(FeatureExtractor): return bool(self.pe.net) def is_mixed_mode(self) -> bool: + assert self.pe is not None + assert self.pe.net is not None + assert self.pe.net.Flags is not None + return not bool(self.pe.net.Flags.CLR_ILONLY) def get_runtime_version(self) -> Tuple[int, int]: + assert self.pe is not None + assert self.pe.net is not None + assert self.pe.net.struct is not None + return self.pe.net.struct.MajorRuntimeVersion, self.pe.net.struct.MinorRuntimeVersion def get_meta_version_string(self) -> str: - return self.pe.net.metadata.struct.Version.rstrip(b"\x00").decode("utf-8") + assert 
self.pe.net is not None + assert self.pe.net.metadata is not None + assert self.pe.net.metadata.struct is not None + assert self.pe.net.metadata.struct.Version is not None + + vbuf = self.pe.net.metadata.struct.Version + assert isinstance(vbuf, bytes) + + return vbuf.rstrip(b"\x00").decode("utf-8") def get_functions(self): raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index b9c9f00a..1b5aa1f3 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -1,5 +1,5 @@ import logging -from typing import Tuple, Iterator +from typing import Tuple, Iterator, cast import dnfile import pefile @@ -62,11 +62,15 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple # namespaces may be referenced multiple times, so we need to filter namespaces = set() - for row in iter_dotnet_table(pe, "TypeDef"): - namespaces.add(row.TypeNamespace) + for (_, typedef) in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number): + # emit internal .NET namespaces + assert isinstance(typedef, dnfile.mdtable.TypeDefRow) + namespaces.add(typedef.TypeNamespace) - for row in iter_dotnet_table(pe, "TypeRef"): - namespaces.add(row.TypeNamespace) + for (_, typeref) in iter_dotnet_table(pe, dnfile.mdtable.TypeRef.number): + # emit external .NET namespaces + assert isinstance(typeref, dnfile.mdtable.TypeRefRow) + namespaces.add(typeref.TypeNamespace) # namespaces may be empty, discard namespaces.discard("") @@ -78,13 +82,19 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple def extract_file_class_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Class, Address]]: """emit class features from TypeRef and TypeDef tables""" - for (rid, row) in enumerate(iter_dotnet_table(pe, "TypeDef")): - token = calculate_dotnet_token_value(pe.net.mdtables.TypeDef.number, rid + 1) - 
yield Class(DnType.format_name(row.TypeName, namespace=row.TypeNamespace)), DNTokenAddress(token) + for (rid, typedef) in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number): + # emit internal .NET classes + assert isinstance(typedef, dnfile.mdtable.TypeDefRow) - for (rid, row) in enumerate(iter_dotnet_table(pe, "TypeRef")): - token = calculate_dotnet_token_value(pe.net.mdtables.TypeRef.number, rid + 1) - yield Class(DnType.format_name(row.TypeName, namespace=row.TypeNamespace)), DNTokenAddress(token) + token = calculate_dotnet_token_value(dnfile.mdtable.TypeDef.number, rid) + yield Class(DnType.format_name(typedef.TypeName, namespace=typedef.TypeNamespace)), DNTokenAddress(token) + + for (rid, typeref) in iter_dotnet_table(pe, dnfile.mdtable.TypeRef.number): + # emit external .NET classes + assert isinstance(typeref, dnfile.mdtable.TypeRefRow) + + token = calculate_dotnet_token_value(dnfile.mdtable.TypeRef.number, rid) + yield Class(DnType.format_name(typeref.TypeName, namespace=typeref.TypeNamespace)), DNTokenAddress(token) def extract_file_os(**kwargs) -> Iterator[Tuple[OS, Address]]: @@ -94,6 +104,9 @@ def extract_file_os(**kwargs) -> Iterator[Tuple[OS, Address]]: def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Arch, Address]]: # to distinguish in more detail, see https://stackoverflow.com/a/23614024/10548020 # .NET 4.5 added option: any CPU, 32-bit preferred + assert pe.net is not None + assert pe.net.Flags is not None + if pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE: yield Arch(ARCH_I386), NO_ADDRESS elif not pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE_PLUS: @@ -144,7 +157,7 @@ GLOBAL_HANDLERS = ( class DotnetFileFeatureExtractor(FeatureExtractor): def __init__(self, path: str): - super(DotnetFileFeatureExtractor, self).__init__() + super().__init__() self.path: str = path self.pe: dnfile.dnPE = dnfile.dnPE(path) @@ -155,6 +168,9 @@ class 
DotnetFileFeatureExtractor(FeatureExtractor): # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token # False: managed EP: RVA + assert self.pe.net is not None + assert self.pe.net.struct is not None + return self.pe.net.struct.EntryPointTokenOrRva def extract_global_features(self): @@ -170,10 +186,23 @@ class DotnetFileFeatureExtractor(FeatureExtractor): return is_dotnet_mixed_mode(self.pe) def get_runtime_version(self) -> Tuple[int, int]: + assert self.pe.net is not None + assert self.pe.net.struct is not None + assert self.pe.net.struct.MajorRuntimeVersion is not None + assert self.pe.net.struct.MinorRuntimeVersion is not None + return self.pe.net.struct.MajorRuntimeVersion, self.pe.net.struct.MinorRuntimeVersion def get_meta_version_string(self) -> str: - return self.pe.net.metadata.struct.Version.rstrip(b"\x00").decode("utf-8") + assert self.pe.net is not None + assert self.pe.net.metadata is not None + assert self.pe.net.metadata.struct is not None + assert self.pe.net.metadata.struct.Version is not None + + vbuf = self.pe.net.metadata.struct.Version + assert isinstance(vbuf, bytes) + + return vbuf.rstrip(b"\x00").decode("utf-8") def get_functions(self): raise NotImplementedError("DotnetFileFeatureExtractor can only be used to extract file features") diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index 9f4f9b34..6c88d46e 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -7,8 +7,11 @@ # See the License for the specific language governing permissions and limitations under the License. 
import struct import logging +import itertools +import collections from enum import Enum -from typing import BinaryIO +from typing import Set, Dict, List, Tuple, BinaryIO, Iterator, Optional +from dataclasses import dataclass logger = logging.getLogger(__name__) @@ -21,6 +24,12 @@ def align(v, alignment): return v + (alignment - remainder) +def read_cstr(buf, offset): + s = buf[offset:] + s, _, _ = s.partition(b"\x00") + return s.decode("utf-8") + + class CorruptElfFile(ValueError): pass @@ -60,52 +69,95 @@ GNU_ABI_TAG = { } -def detect_elf_os(f) -> str: - """ - f: type Union[BinaryIO, IDAIO] - """ - f.seek(0x0) - file_header = f.read(0x40) +@dataclass +class Phdr: + type: int + offset: int + vaddr: int + paddr: int + filesz: int + buf: bytes - # we'll set this to the detected OS - # prefer the first heuristics, - # but rather than short circuiting, - # we'll still parse out the remainder, for debugging. - ret = None - if not file_header.startswith(b"\x7fELF"): - raise CorruptElfFile("missing magic header") +@dataclass +class Shdr: + name: int + type: int + flags: int + addr: int + offset: int + size: int + link: int + buf: bytes - ei_class, ei_data = struct.unpack_from("BB", file_header, 4) - logger.debug("ei_class: 0x%02x ei_data: 0x%02x", ei_class, ei_data) - if ei_class == 1: - bitness = 32 - elif ei_class == 2: - bitness = 64 - else: - raise CorruptElfFile("invalid ei_class: 0x%02x" % ei_class) - if ei_data == 1: - endian = "<" - elif ei_data == 2: - endian = ">" - else: - raise CorruptElfFile("not an ELF file: invalid ei_data: 0x%02x" % ei_data) +class ELF: + def __init__(self, f: BinaryIO): + self.f = f - if bitness == 32: - (e_phoff, e_shoff) = struct.unpack_from(endian + "II", file_header, 0x1C) - e_phentsize, e_phnum = struct.unpack_from(endian + "HH", file_header, 0x2A) - e_shentsize, e_shnum = struct.unpack_from(endian + "HH", file_header, 0x2E) - elif bitness == 64: - (e_phoff, e_shoff) = struct.unpack_from(endian + "QQ", file_header, 0x20) - 
e_phentsize, e_phnum = struct.unpack_from(endian + "HH", file_header, 0x36) - e_shentsize, e_shnum = struct.unpack_from(endian + "HH", file_header, 0x3A) - else: - raise NotImplementedError() + # these will all be initialized in `_parse()` + self.bitness: int + self.endian: str + self.e_phentsize: int + self.e_phnum: int + self.e_shentsize: int + self.e_shnum: int + self.phbuf: bytes + self.shbuf: bytes - logger.debug("e_phoff: 0x%02x e_phentsize: 0x%02x e_phnum: %d", e_phoff, e_phentsize, e_phnum) + self._parse() + + def _parse(self): + + self.f.seek(0x0) + self.file_header = self.f.read(0x40) + + if not self.file_header.startswith(b"\x7fELF"): + raise CorruptElfFile("missing magic header") + + ei_class, ei_data = struct.unpack_from("BB", self.file_header, 4) + logger.debug("ei_class: 0x%02x ei_data: 0x%02x", ei_class, ei_data) + if ei_class == 1: + self.bitness = 32 + elif ei_class == 2: + self.bitness = 64 + else: + raise CorruptElfFile("invalid ei_class: 0x%02x" % ei_class) + + if ei_data == 1: + self.endian = "<" + elif ei_data == 2: + self.endian = ">" + else: + raise CorruptElfFile("not an ELF file: invalid ei_data: 0x%02x" % ei_data) + + if self.bitness == 32: + e_phoff, e_shoff = struct.unpack_from(self.endian + "II", self.file_header, 0x1C) + self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x2A) + self.e_shentsize, self.e_shnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x2E) + elif self.bitness == 64: + e_phoff, e_shoff = struct.unpack_from(self.endian + "QQ", self.file_header, 0x20) + self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x36) + self.e_shentsize, self.e_shnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x3A) + else: + raise NotImplementedError() + + logger.debug("e_phoff: 0x%02x e_phentsize: 0x%02x e_phnum: %d", e_phoff, self.e_phentsize, self.e_phnum) + + self.f.seek(e_phoff) + program_header_size = self.e_phnum * 
self.e_phentsize + self.phbuf = self.f.read(program_header_size) + if len(self.phbuf) != program_header_size: + logger.warning("failed to read program headers") + self.e_phnum = 0 + + self.f.seek(e_shoff) + section_header_size = self.e_shnum * self.e_shentsize + self.shbuf = self.f.read(section_header_size) + if len(self.shbuf) != section_header_size: + logger.warning("failed to read section headers") + self.e_shnum = 0 - (ei_osabi,) = struct.unpack_from(endian + "B", file_header, 7) OSABI = { # via pyelftools: https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/elftools/elf/enums.py#L35-L58 # some candidates are commented out because the are not useful values, @@ -133,218 +185,598 @@ def detect_elf_os(f) -> str: # 97: "ARM", # not an OS # 255: "STANDALONE", # not an OS } - logger.debug("ei_osabi: 0x%02x (%s)", ei_osabi, OSABI.get(ei_osabi, "unknown")) - # os_osabi == 0 is commonly set even when the OS is not SYSV. - # other values are unused or unknown. - if ei_osabi in OSABI and ei_osabi != 0x0: - # subsequent strategies may overwrite this value - ret = OSABI[ei_osabi] + @property + def ei_osabi(self) -> Optional[OS]: + (ei_osabi,) = struct.unpack_from(self.endian + "B", self.file_header, 7) + return ELF.OSABI.get(ei_osabi) - f.seek(e_phoff) - program_header_size = e_phnum * e_phentsize - program_headers = f.read(program_header_size) - if len(program_headers) != program_header_size: - logger.warning("failed to read program headers") - e_phnum = 0 + MACHINE = { + # via https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html + 1: "M32", + 2: "SPARC", + 3: "i386", + 4: "68K", + 5: "88K", + 6: "486", + 7: "860", + 8: "MIPS", + 9: "S370", + 10: "MIPS_RS3_LE", + 11: "RS6000", + 15: "PA_RISC", + 16: "nCUBE", + 17: "VPP500", + 18: "SPARC32PLUS", + 19: "960", + 20: "PPC", + 21: "PPC64", + 22: "S390", + 23: "SPU", + 36: "V800", + 37: "FR20", + 38: "RH32", + 39: "RCE", + 40: "ARM", + 41: "ALPHA", + 42: "SH", + 43: "SPARCV9", + 44: 
"TRICORE", + 45: "ARC", + 46: "H8_300", + 47: "H8_300H", + 48: "H8S", + 49: "H8_500", + 50: "IA_64", + 51: "MIPS_X", + 52: "COLDFIRE", + 53: "68HC12", + 54: "MMA", + 55: "PCP", + 56: "NCPU", + 57: "NDR1", + 58: "STARCORE", + 59: "ME16", + 60: "ST100", + 61: "TINYJ", + 62: "amd64", + 63: "PDSP", + 64: "PDP10", + 65: "PDP11", + 66: "FX66", + 67: "ST9PLUS", + 68: "ST7", + 69: "68HC16", + 70: "68HC11", + 71: "68HC08", + 72: "68HC05", + 73: "SVX", + 74: "ST19", + 75: "VAX", + 76: "CRIS", + 77: "JAVELIN", + 78: "FIREPATH", + 79: "ZSP", + 80: "MMIX", + 81: "HUANY", + 82: "PRISM", + 83: "AVR", + 84: "FR30", + 85: "D10V", + 86: "D30V", + 87: "V850", + 88: "M32R", + 89: "MN10300", + 90: "MN10200", + 91: "PJ", + 92: "OPENRISC", + 93: "ARC_A5", + 94: "XTENSA", + 95: "VIDEOCORE", + 96: "TMM_GPP", + 97: "NS32K", + 98: "TPC", + 99: "SNP1K", + 100: "ST200", + } - # search for PT_NOTE sections that specify an OS - # for example, on Linux there is a GNU section with minimum kernel version - for i in range(e_phnum): - offset = i * e_phentsize - phent = program_headers[offset : offset + e_phentsize] + @property + def e_machine(self) -> Optional[str]: + (e_machine,) = struct.unpack_from(self.endian + "H", self.file_header, 0x12) + return ELF.MACHINE.get(e_machine) - PT_NOTE = 0x4 + def parse_program_header(self, i) -> Phdr: + phent_offset = i * self.e_phentsize + phent = self.phbuf[phent_offset : phent_offset + self.e_phentsize] - (p_type,) = struct.unpack_from(endian + "I", phent, 0x0) + (p_type,) = struct.unpack_from(self.endian + "I", phent, 0x0) logger.debug("ph:p_type: 0x%04x", p_type) - if p_type != PT_NOTE: - continue - if bitness == 32: - p_offset, _, _, p_filesz = struct.unpack_from(endian + "IIII", phent, 0x4) - elif bitness == 64: - p_offset, _, _, p_filesz = struct.unpack_from(endian + "QQQQ", phent, 0x8) + if self.bitness == 32: + p_offset, p_vaddr, p_paddr, p_filesz = struct.unpack_from(self.endian + "IIII", phent, 0x4) + elif self.bitness == 64: + p_offset, p_vaddr, 
p_paddr, p_filesz = struct.unpack_from(self.endian + "QQQQ", phent, 0x8) else: raise NotImplementedError() logger.debug("ph:p_offset: 0x%02x p_filesz: 0x%04x", p_offset, p_filesz) - f.seek(p_offset) - note = f.read(p_filesz) - if len(note) != p_filesz: - logger.warning("failed to read note content") - continue + self.f.seek(p_offset) + buf = self.f.read(p_filesz) + if len(buf) != p_filesz: + raise ValueError("failed to read program header content") - namesz, descsz, type_ = struct.unpack_from(endian + "III", note, 0x0) - name_offset = 0xC - desc_offset = name_offset + align(namesz, 0x4) + return Phdr(p_type, p_offset, p_vaddr, p_paddr, p_filesz, buf) - logger.debug("ph:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, descsz, type_) - - name = note[name_offset : name_offset + namesz].partition(b"\x00")[0].decode("ascii") - logger.debug("name: %s", name) - - if type_ != 1: - continue - - if name == "GNU": - if descsz < 16: + @property + def program_headers(self): + for i in range(self.e_phnum): + try: + yield self.parse_program_header(i) + except ValueError: continue - desc = note[desc_offset : desc_offset + descsz] - abi_tag, kmajor, kminor, kpatch = struct.unpack_from(endian + "IIII", desc, 0x0) - logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) + def parse_section_header(self, i) -> Shdr: + shent_offset = i * self.e_shentsize + shent = self.shbuf[shent_offset : shent_offset + self.e_shentsize] - if abi_tag in GNU_ABI_TAG: - # update only if not set - # so we can get the debugging output of subsequent strategies - ret = GNU_ABI_TAG[abi_tag] if not ret else ret - logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", ret, kmajor, kminor, kpatch) - elif name == "OpenBSD": - logger.debug("note owner: %s", "OPENBSD") - ret = OS.OPENBSD if not ret else ret - elif name == "NetBSD": - logger.debug("note owner: %s", "NETBSD") - ret = OS.NETBSD if not ret else ret - elif name == "FreeBSD": - logger.debug("note owner: %s", "FREEBSD") - ret = OS.FREEBSD if not ret 
else ret - - # search for recognizable dynamic linkers (interpreters) - # for example, on linux, we see file paths like: /lib64/ld-linux-x86-64.so.2 - for i in range(e_phnum): - offset = i * e_phentsize - phent = program_headers[offset : offset + e_phentsize] - - PT_INTERP = 0x3 - - (p_type,) = struct.unpack_from(endian + "I", phent, 0x0) - if p_type != PT_INTERP: - continue - - if bitness == 32: - p_offset, _, _, p_filesz = struct.unpack_from(endian + "IIII", phent, 0x4) - elif bitness == 64: - p_offset, _, _, p_filesz = struct.unpack_from(endian + "QQQQ", phent, 0x8) + if self.bitness == 32: + sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link = struct.unpack_from( + self.endian + "IIIIIII", shent, 0x0 + ) + elif self.bitness == 64: + sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link = struct.unpack_from( + self.endian + "IIQQQQI", shent, 0x0 + ) else: raise NotImplementedError() - f.seek(p_offset) - interp = f.read(p_filesz) - if len(interp) != p_filesz: - logger.warning("failed to read interp content") - continue - - linker = interp.partition(b"\x00")[0].decode("ascii") - logger.debug("linker: %s", linker) - if "ld-linux" in linker: - # update only if not set - # so we can get the debugging output of subsequent strategies - ret = OS.LINUX if ret is None else ret - - f.seek(e_shoff) - section_header_size = e_shnum * e_shentsize - section_headers = f.read(section_header_size) - if len(section_headers) != section_header_size: - logger.warning("failed to read section headers") - e_shnum = 0 - - # search for notes stored in sections that aren't visible in program headers. - # e.g. .note.Linux in Linux kernel modules. 
- for i in range(e_shnum): - offset = i * e_shentsize - shent = section_headers[offset : offset + e_shentsize] - - if bitness == 32: - sh_name, sh_type, _, sh_addr, sh_offset, sh_size = struct.unpack_from(endian + "IIIIII", shent, 0x0) - elif bitness == 64: - sh_name, sh_type, _, sh_addr, sh_offset, sh_size = struct.unpack_from(endian + "IIQQQQ", shent, 0x0) - else: - raise NotImplementedError() - - SHT_NOTE = 0x7 - if sh_type != SHT_NOTE: - continue - logger.debug("sh:sh_offset: 0x%02x sh_size: 0x%04x", sh_offset, sh_size) - f.seek(sh_offset) - note = f.read(sh_size) - if len(note) != sh_size: - logger.warning("failed to read note content") - continue + self.f.seek(sh_offset) + buf = self.f.read(sh_size) + if len(buf) != sh_size: + raise ValueError("failed to read section header content") - namesz, descsz, type_ = struct.unpack_from(endian + "III", note, 0x0) - name_offset = 0xC - desc_offset = name_offset + align(namesz, 0x4) + return Shdr(sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link, buf) - logger.debug("sh:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, descsz, type_) - - name = note[name_offset : name_offset + namesz].partition(b"\x00")[0].decode("ascii") - logger.debug("name: %s", name) - - if name == "Linux": - logger.debug("note owner: %s", "LINUX") - ret = OS.LINUX if not ret else ret - elif name == "OpenBSD": - logger.debug("note owner: %s", "OPENBSD") - ret = OS.OPENBSD if not ret else ret - elif name == "NetBSD": - logger.debug("note owner: %s", "NETBSD") - ret = OS.NETBSD if not ret else ret - elif name == "FreeBSD": - logger.debug("note owner: %s", "FREEBSD") - ret = OS.FREEBSD if not ret else ret - elif name == "GNU": - if descsz < 16: + @property + def section_headers(self): + for i in range(self.e_shnum): + try: + yield self.parse_section_header(i) + except ValueError: continue - desc = note[desc_offset : desc_offset + descsz] - abi_tag, kmajor, kminor, kpatch = struct.unpack_from(endian + "IIII", desc, 0x0) - 
logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) + @property + def linker(self): + PT_INTERP = 0x3 + for phdr in self.program_headers: + if phdr.type != PT_INTERP: + continue - if abi_tag in GNU_ABI_TAG: - # update only if not set - # so we can get the debugging output of subsequent strategies - ret = GNU_ABI_TAG[abi_tag] if not ret else ret - logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", ret, kmajor, kminor, kpatch) + return read_cstr(phdr.buf, 0) + + @property + def versions_needed(self) -> Dict[str, Set[str]]: + # symbol version requirements are stored in the .gnu.version_r section, + # which has type SHT_GNU_verneed (0x6ffffffe). + # + # this contains a linked list of ElfXX_Verneed structs, + # each referencing a linked list of ElfXX_Vernaux structs. + # strings are stored in the section referenced by the sh_link field of the section header. + # each Verneed struct contains a reference to the name of the library, + # each Vernaux struct contains a reference to the name of a symbol. + SHT_GNU_VERNEED = 0x6FFFFFFE + for shdr in self.section_headers: + if shdr.type != SHT_GNU_VERNEED: + continue + + # the linked section contains strings referenced by the verneed structures. + linked_shdr = self.parse_section_header(shdr.link) + + versions_needed = collections.defaultdict(set) + + # read verneed structures from the start of the section + # until the vn_next link is 0x0. + # each entry describes a shared object that is required by this binary. + vn_offset = 0x0 + while True: + # ElfXX_Verneed layout is the same on 32 and 64 bit + vn_version, vn_cnt, vn_file, vn_aux, vn_next = struct.unpack_from( + self.endian + "HHIII", shdr.buf, vn_offset + ) + if vn_version != 1: + # unexpected format, don't try to keep parsing + break + + # shared object names, like: "libdl.so.2" + so_name = read_cstr(linked_shdr.buf, vn_file) + + # read vernaux structures linked from the verneed structure. + # there should be vn_cnt of these. 
+ # each entry describes an ABI name required by the shared object. + vna_offset = vn_offset + vn_aux + for i in range(vn_cnt): + # ElfXX_Vernaux layout is the same on 32 and 64 bit + _, _, _, vna_name, vna_next = struct.unpack_from(self.endian + "IHHII", shdr.buf, vna_offset) + + # ABI names, like: "GLIBC_2.2.5" + abi = read_cstr(linked_shdr.buf, vna_name) + versions_needed[so_name].add(abi) + + vna_offset += vna_next + + vn_offset += vn_next + if vn_next == 0: + break + + return dict(versions_needed) + + return {} + + @property + def dynamic_entries(self) -> Iterator[Tuple[int, int]]: + """ + read the entries from the dynamic section, + yielding the tag and value for each entry. + """ + DT_NULL = 0x0 + PT_DYNAMIC = 0x2 + for phdr in self.program_headers: + if phdr.type != PT_DYNAMIC: + continue + + offset = 0x0 + while True: + if self.bitness == 32: + d_tag, d_val = struct.unpack_from(self.endian + "II", phdr.buf, offset) + offset += 8 + elif self.bitness == 64: + d_tag, d_val = struct.unpack_from(self.endian + "QQ", phdr.buf, offset) + offset += 16 + else: + raise NotImplementedError() + + if d_tag == DT_NULL: + break + + yield d_tag, d_val + + @property + def strtab(self) -> Optional[bytes]: + """ + fetch the bytes of the string table + referenced by the dynamic section. 
+ """ + DT_STRTAB = 0x5 + DT_STRSZ = 0xA + + strtab_addr = None + strtab_size = None + + for d_tag, d_val in self.dynamic_entries: + if d_tag == DT_STRTAB: + strtab_addr = d_val + + for d_tag, d_val in self.dynamic_entries: + if d_tag == DT_STRSZ: + strtab_size = d_val + + if strtab_addr is None: + return None + + if strtab_size is None: + return None + + strtab_offset = None + for shdr in self.section_headers: + if shdr.addr <= strtab_addr < shdr.addr + shdr.size: + strtab_offset = shdr.offset + (strtab_addr - shdr.addr) + + if strtab_offset is None: + return None + + self.f.seek(strtab_offset) + strtab_buf = self.f.read(strtab_size) + + if len(strtab_buf) != strtab_size: + return None + + return strtab_buf + + @property + def needed(self) -> Iterator[str]: + """ + read the names of DT_NEEDED entries from the dynamic section, + which correspond to dependencies on other shared objects, + like: `libpthread.so.0` + """ + DT_NEEDED = 0x1 + strtab = self.strtab + if not strtab: + return + + for d_tag, d_val in self.dynamic_entries: + if d_tag != DT_NEEDED: + continue + + yield read_cstr(strtab, d_val) + + +@dataclass +class ABITag: + os: OS + kmajor: int + kminor: int + kpatch: int + + +class PHNote: + def __init__(self, endian: str, buf: bytes): + self.endian = endian + self.buf = buf + + # these will be initialized in `_parse()` + self.type_: int + self.descsz: int + self.name: str + + self._parse() + + def _parse(self): + namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0) + name_offset = 0xC + self.desc_offset = name_offset + align(namesz, 0x4) + + logger.debug("ph:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, self.descsz, self.type_) + + self.name = self.buf[name_offset : name_offset + namesz].partition(b"\x00")[0].decode("ascii") + logger.debug("name: %s", self.name) + + @property + def abi_tag(self) -> Optional[ABITag]: + if self.type_ != 1: + # > The type field shall be 1. 
+ # Linux Standard Base Specification 1.2 + # ref: https://refspecs.linuxfoundation.org/LSB_1.2.0/gLSB/noteabitag.html + return None + + if self.name != "GNU": + return None + + if self.descsz < 16: + return None + + desc = self.buf[self.desc_offset : self.desc_offset + self.descsz] + abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0) + logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) + + os = GNU_ABI_TAG.get(abi_tag) + if not os: + return None + + logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", os, kmajor, kminor, kpatch) + + return ABITag(os, kmajor, kminor, kpatch) + + +class SHNote: + def __init__(self, endian: str, buf: bytes): + self.endian = endian + self.buf = buf + + # these will be initialized in `_parse()` + self.type_: int + self.descsz: int + self.name: str + + self._parse() + + def _parse(self): + namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0) + name_offset = 0xC + self.desc_offset = name_offset + align(namesz, 0x4) + + logger.debug("sh:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, self.descsz, self.type_) + + name_buf = self.buf[name_offset : name_offset + namesz] + self.name = read_cstr(name_buf, 0x0) + logger.debug("sh:name: %s", self.name) + + @property + def abi_tag(self) -> Optional[ABITag]: + if self.name != "GNU": + return None + + if self.descsz < 16: + return None + + desc = self.buf[self.desc_offset : self.desc_offset + self.descsz] + abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0) + logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) + + os = GNU_ABI_TAG.get(abi_tag) + if not os: + return None + + logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", os, kmajor, kminor, kpatch) + return ABITag(os, kmajor, kminor, kpatch) + + +def guess_os_from_osabi(elf) -> Optional[OS]: + return elf.ei_osabi + + +def guess_os_from_ph_notes(elf) -> Optional[OS]: + # search for PT_NOTE sections that specify an OS + # for 
example, on Linux there is a GNU section with minimum kernel version + PT_NOTE = 0x4 + for phdr in elf.program_headers: + if phdr.type != PT_NOTE: + continue + + note = PHNote(elf.endian, phdr.buf) + + if note.type_ != 1: + # > The type field shall be 1. + # Linux Standard Base Specification 1.2 + # ref: https://refspecs.linuxfoundation.org/LSB_1.2.0/gLSB/noteabitag.html + continue + + if note.name == "Linux": + logger.debug("note owner: %s", "LINUX") + return OS.LINUX + elif note.name == "OpenBSD": + logger.debug("note owner: %s", "OPENBSD") + return OS.OPENBSD + elif note.name == "NetBSD": + logger.debug("note owner: %s", "NETBSD") + return OS.NETBSD + elif note.name == "FreeBSD": + logger.debug("note owner: %s", "FREEBSD") + return OS.FREEBSD + elif note.name == "GNU": + abi_tag = note.abi_tag + if abi_tag: + return abi_tag.os + else: + # cannot make a guess about the OS, but probably linux or hurd + pass + + return None + + +def guess_os_from_sh_notes(elf) -> Optional[OS]: + # search for notes stored in sections that aren't visible in program headers. + # e.g. .note.Linux in Linux kernel modules. 
+ SHT_NOTE = 0x7 + for shdr in elf.section_headers: + if shdr.type != SHT_NOTE: + continue + + note = SHNote(elf.endian, shdr.buf) + + if note.name == "Linux": + logger.debug("note owner: %s", "LINUX") + return OS.LINUX + elif note.name == "OpenBSD": + logger.debug("note owner: %s", "OPENBSD") + return OS.OPENBSD + elif note.name == "NetBSD": + logger.debug("note owner: %s", "NETBSD") + return OS.NETBSD + elif note.name == "FreeBSD": + logger.debug("note owner: %s", "FREEBSD") + return OS.FREEBSD + elif note.name == "GNU": + abi_tag = note.abi_tag + if abi_tag: + return abi_tag.os + else: + # cannot make a guess about the OS, but probably linux or hurd + pass + + return None + + +def guess_os_from_linker(elf) -> Optional[OS]: + # search for recognizable dynamic linkers (interpreters) + # for example, on linux, we see file paths like: /lib64/ld-linux-x86-64.so.2 + linker = elf.linker + if linker and "ld-linux" in elf.linker: + return OS.LINUX + + return None + + +def guess_os_from_abi_versions_needed(elf) -> Optional[OS]: + # then lets look for GLIBC symbol versioning requirements. + # this will let us guess about linux/hurd in some cases. + + versions_needed = elf.versions_needed + if any(map(lambda abi: abi.startswith("GLIBC"), itertools.chain(*versions_needed.values()))): + # there are any GLIBC versions needed + + if elf.e_machine != "i386": + # GLIBC runs on Linux and Hurd. + # for Hurd, its *only* on i386. + # so if we're not on i386, then we're on Linux. + return OS.LINUX + + else: + # we're on i386, so we could be on either Linux or Hurd. 
+ linker = elf.linker + + if linker and "ld-linux" in linker: + return OS.LINUX + + elif linker and "/ld.so" in linker: + return OS.HURD + + else: + # we don't have any good guesses based on versions needed + pass + + return None + + +def guess_os_from_needed_dependencies(elf) -> Optional[OS]: + for needed in elf.needed: + if needed.startswith("libmachuser.so"): + return OS.HURD + if needed.startswith("libhurduser.so"): + return OS.HURD + + return None + + +def detect_elf_os(f) -> str: + """ + f: type Union[BinaryIO, IDAIO] + """ + elf = ELF(f) + + osabi_guess = guess_os_from_osabi(elf) + logger.debug("guess: osabi: %s", osabi_guess) + + ph_notes_guess = guess_os_from_ph_notes(elf) + logger.debug("guess: ph notes: %s", ph_notes_guess) + + sh_notes_guess = guess_os_from_sh_notes(elf) + logger.debug("guess: sh notes: %s", sh_notes_guess) + + linker_guess = guess_os_from_linker(elf) + logger.debug("guess: linker: %s", linker_guess) + + abi_versions_needed_guess = guess_os_from_abi_versions_needed(elf) + logger.debug("guess: ABI versions needed: %s", abi_versions_needed_guess) + + needed_dependencies_guess = guess_os_from_needed_dependencies(elf) + logger.debug("guess: needed dependencies: %s", needed_dependencies_guess) + + ret = None + + if osabi_guess: + ret = osabi_guess + + elif ph_notes_guess: + ret = ph_notes_guess + + elif sh_notes_guess: + ret = sh_notes_guess + + elif linker_guess: + ret = linker_guess + + elif abi_versions_needed_guess: + ret = abi_versions_needed_guess + + elif needed_dependencies_guess: + ret = needed_dependencies_guess return ret.value if ret is not None else "unknown" -class Arch(str, Enum): - I386 = "i386" - AMD64 = "amd64" - - def detect_elf_arch(f: BinaryIO) -> str: - f.seek(0x0) - file_header = f.read(0x40) - - if not file_header.startswith(b"\x7fELF"): - raise CorruptElfFile("missing magic header") - - (ei_data,) = struct.unpack_from("B", file_header, 5) - logger.debug("ei_data: 0x%02x", ei_data) - - if ei_data == 1: - endian = "<" 
- elif ei_data == 2: - endian = ">" - else: - raise CorruptElfFile("not an ELF file: invalid ei_data: 0x%02x" % ei_data) - - (ei_machine,) = struct.unpack_from(endian + "H", file_header, 0x12) - logger.debug("ei_machine: 0x%02x", ei_machine) - - EM_386 = 0x3 - EM_X86_64 = 0x3E - if ei_machine == EM_386: - return Arch.I386 - elif ei_machine == EM_X86_64: - return Arch.AMD64 - else: - # not really unknown, but unsupport at the moment: - # https://github.com/eliben/pyelftools/blob/ab444d982d1849191e910299a985989857466620/elftools/elf/enums.py#L73 - return "unknown" + return ELF(f).e_machine or "unknown" diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py index 9077f97c..d4f61a06 100644 --- a/capa/features/extractors/elffile.py +++ b/capa/features/extractors/elffile.py @@ -7,7 +7,6 @@ # See the License for the specific language governing permissions and limitations under the License. import io import logging -import contextlib from typing import Tuple, Iterator from elftools.elf.elffile import ELFFile, SymbolTableSection @@ -16,7 +15,6 @@ import capa.features.extractors.common from capa.features.file import Import, Section from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress -from capa.features.extractors.elf import Arch as ElfArch from capa.features.extractors.base_extractor import FeatureExtractor logger = logging.getLogger(__name__) @@ -26,17 +24,17 @@ def extract_file_import_names(elf, **kwargs): # see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372 symbol_tables = [(idx, s) for idx, s in enumerate(elf.iter_sections()) if isinstance(s, SymbolTableSection)] - for section_index, section in symbol_tables: + for _, section in symbol_tables: if not isinstance(section, SymbolTableSection): continue if section["sh_entsize"] == 0: - logger.debug("Symbol table '%s' has a 
sh_entsize of zero!" % (section.name)) + logger.debug("Symbol table '%s' has a sh_entsize of zero!", section.name) continue - logger.debug("Symbol table '%s' contains %s entries:" % (section.name, section.num_symbols())) + logger.debug("Symbol table '%s' contains %s entries:", section.name, section.num_symbols()) - for nsym, symbol in enumerate(section.iter_symbols()): + for _, symbol in enumerate(section.iter_symbols()): if symbol.name and symbol.entry.st_info.type == "STT_FUNC": # TODO symbol address # TODO symbol version info? @@ -73,9 +71,9 @@ def extract_file_arch(elf, **kwargs): # TODO merge with capa.features.extractors.elf.detect_elf_arch() arch = elf.get_machine_arch() if arch == "x86": - yield Arch(ElfArch.I386), NO_ADDRESS + yield Arch("i386"), NO_ADDRESS elif arch == "x64": - yield Arch(ElfArch.AMD64), NO_ADDRESS + yield Arch("amd64"), NO_ADDRESS else: logger.warning("unsupported architecture: %s", arch) @@ -110,7 +108,7 @@ GLOBAL_HANDLERS = ( class ElfFeatureExtractor(FeatureExtractor): def __init__(self, path: str): - super(ElfFeatureExtractor, self).__init__() + super().__init__() self.path = path with open(self.path, "rb") as f: self.elf = ELFFile(io.BytesIO(f.read())) @@ -153,8 +151,8 @@ class ElfFeatureExtractor(FeatureExtractor): def extract_insn_features(self, f, bb, insn): raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") - def is_library_function(self, va): + def is_library_function(self, addr): raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") - def get_function_name(self, va): + def get_function_name(self, addr): raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features") diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index a09d8fe3..1a587fa6 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -23,7 +23,7 @@ from 
capa.features.extractors.base_extractor import BBHandle, InsnHandle, Functi class IdaFeatureExtractor(FeatureExtractor): def __init__(self): - super(IdaFeatureExtractor, self).__init__() + super().__init__() self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) diff --git a/capa/features/extractors/ida/file.py b/capa/features/extractors/ida/file.py index eefef531..31c7fb09 100644 --- a/capa/features/extractors/ida/file.py +++ b/capa/features/extractors/ida/file.py @@ -115,6 +115,9 @@ def extract_file_import_names() -> Iterator[Tuple[Feature, Address]]: for name in capa.features.extractors.helpers.generate_symbols(dll, symbol): yield Import(name), addr + for (ea, info) in capa.features.extractors.ida.helpers.get_file_externs().items(): + yield Import(info[1]), AbsoluteVirtualAddress(ea) + def extract_file_section_names() -> Iterator[Tuple[Feature, Address]]: """extract section names @@ -165,7 +168,7 @@ def extract_file_function_names() -> Iterator[Tuple[Feature, Address]]: def extract_file_format() -> Iterator[Tuple[Feature, Address]]: file_info = idaapi.get_inf_structure() - if file_info.filetype == idaapi.f_PE: + if file_info.filetype in (idaapi.f_PE, idaapi.f_COFF): yield Format(FORMAT_PE), NO_ADDRESS elif file_info.filetype == idaapi.f_ELF: yield Format(FORMAT_ELF), NO_ADDRESS @@ -173,7 +176,7 @@ def extract_file_format() -> Iterator[Tuple[Feature, Address]]: # no file type to return when processing a binary file, but we want to continue processing return else: - raise NotImplementedError("file format: %d" % file_info.filetype) + raise NotImplementedError("unexpected file format: %d" % file_info.filetype) def extract_features() -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ida/helpers.py b/capa/features/extractors/ida/helpers.py index 186723d2..3b411654 100644 --- 
a/capa/features/extractors/ida/helpers.py +++ b/capa/features/extractors/ida/helpers.py @@ -5,12 +5,13 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Any, Dict, Tuple, Iterator +from typing import Any, Dict, Tuple, Iterator, Optional import idc import idaapi import idautils import ida_bytes +import ida_segment from capa.features.address import AbsoluteVirtualAddress from capa.features.extractors.base_extractor import FunctionHandle @@ -35,7 +36,7 @@ def find_byte_sequence(start: int, end: int, seq: bytes) -> Iterator[int]: def get_functions( - start: int = None, end: int = None, skip_thunks: bool = False, skip_libs: bool = False + start: Optional[int] = None, end: Optional[int] = None, skip_thunks: bool = False, skip_libs: bool = False ) -> Iterator[FunctionHandle]: """get functions, range optional @@ -109,6 +110,19 @@ def get_file_imports() -> Dict[int, Tuple[str, str, int]]: return imports +def get_file_externs() -> Dict[int, Tuple[str, str, int]]: + externs = {} + + for seg in get_segments(skip_header_segments=True): + if not (seg.type == ida_segment.SEG_XTRN): + continue + + for ea in idautils.Functions(seg.start_ea, seg.end_ea): + externs[ea] = ("", idaapi.get_func_name(ea), -1) + + return externs + + def get_instructions_in_range(start: int, end: int) -> Iterator[idaapi.insn_t]: """yield instructions in range @@ -207,7 +221,8 @@ def get_op_phrase_info(op: idaapi.op_t) -> Dict: return {} scale = 1 << ((op.specflag2 & 0xC0) >> 6) - offset = op.addr + # IDA ea_t may be 32- or 64-bit; we assume displacement can only be 32-bit + offset = op.addr & 0xFFFFFFFF if op.specflag1 == 0: index = None @@ -273,7 +288,7 @@ def is_frame_register(reg: int) -> bool: return reg in 
(idautils.procregs.sp.reg, idautils.procregs.bp.reg) -def get_insn_ops(insn: idaapi.insn_t, target_ops: Tuple[Any] = None) -> idaapi.op_t: +def get_insn_ops(insn: idaapi.insn_t, target_ops: Optional[Tuple[Any]] = None) -> idaapi.op_t: """yield op_t for instruction, filter on type if specified""" for op in insn.ops: if op.type == idaapi.o_void: diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index 75ad987c..b160cbc6 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -23,13 +23,19 @@ from capa.features.extractors.base_extractor import BBHandle, InsnHandle, Functi SECURITY_COOKIE_BYTES_DELTA = 0x40 -def get_imports(ctx: Dict[str, Any]) -> Dict[str, Any]: +def get_imports(ctx: Dict[str, Any]) -> Dict[int, Any]: if "imports_cache" not in ctx: ctx["imports_cache"] = capa.features.extractors.ida.helpers.get_file_imports() return ctx["imports_cache"] -def check_for_api_call(ctx: Dict[str, Any], insn: idaapi.insn_t) -> Iterator[str]: +def get_externs(ctx: Dict[str, Any]) -> Dict[int, Any]: + if "externs_cache" not in ctx: + ctx["externs_cache"] = capa.features.extractors.ida.helpers.get_file_externs() + return ctx["externs_cache"] + + +def check_for_api_call(insn: idaapi.insn_t, funcs: Dict[int, Any]) -> Iterator[Any]: """check instruction for API call""" info = () ref = insn.ea @@ -46,7 +52,7 @@ def check_for_api_call(ctx: Dict[str, Any], insn: idaapi.insn_t) -> Iterator[str except IndexError: break - info = get_imports(ctx).get(ref, ()) + info = funcs.get(ref, ()) if info: break @@ -55,7 +61,7 @@ def check_for_api_call(ctx: Dict[str, Any], insn: idaapi.insn_t) -> Iterator[str break if info: - yield "%s.%s" % (info[0], info[1]) + yield info def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: @@ -70,11 +76,17 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) if not insn.get_canon_mnem() in 
("call", "jmp"): return - for api in check_for_api_call(fh.ctx, insn): - dll, _, symbol = api.rpartition(".") - for name in capa.features.extractors.helpers.generate_symbols(dll, symbol): + # check calls to imported functions + for api in check_for_api_call(insn, get_imports(fh.ctx)): + # tuple (<module>, <function>, <ordinal>) + for name in capa.features.extractors.helpers.generate_symbols(api[0], api[1]): yield API(name), ih.address + # check calls to extern functions + for api in check_for_api_call(insn, get_externs(fh.ctx)): + # tuple (<module>, <function>, <ordinal>) + yield API(api[1]), ih.address # extract IDA/FLIRT recognized API functions targets = tuple(idautils.CodeRefsFrom(insn.ea, False)) if not targets: @@ -201,7 +213,11 @@ def extract_insn_offset_features( continue p_info = capa.features.extractors.ida.helpers.get_op_phrase_info(op) - op_off = p_info.get("offset", 0) + + op_off = p_info.get("offset", None) + if op_off is None: + continue + if idaapi.is_mapped(op_off): # Ignore: # mov esi, dword_1005B148[esi] diff --git a/capa/features/extractors/null.py b/capa/features/extractors/null.py index d5cf72ab..892eadc8 100644 --- a/capa/features/extractors/null.py +++ b/capa/features/extractors/null.py @@ -52,26 +52,21 @@ class NullFeatureExtractor(FeatureExtractor): yield FunctionHandle(address, None) def extract_function_features(self, f): - for address, feature in self.functions.get(f.address, {}).features: + for address, feature in self.functions[f.address].features: yield feature, address def get_basic_blocks(self, f): - for address in sorted(self.functions.get(f.address, {}).basic_blocks.keys()): + for address in sorted(self.functions[f.address].basic_blocks.keys()): yield BBHandle(address, None) def extract_basic_block_features(self, f, bb): - for address, feature in self.functions.get(f.address, {}).basic_blocks.get(bb.address, {}).features: + for address, feature in self.functions[f.address].basic_blocks[bb.address].features: yield feature, address def get_instructions(self, f, bb): - for address in
sorted(self.functions.get(f.address, {}).basic_blocks.get(bb.address, {}).instructions.keys()): + for address in sorted(self.functions[f.address].basic_blocks[bb.address].instructions.keys()): yield InsnHandle(address, None) def extract_insn_features(self, f, bb, insn): - for address, feature in ( - self.functions.get(f.address, {}) - .basic_blocks.get(bb.address, {}) - .instructions.get(insn.address, {}) - .features - ): + for address, feature in self.functions[f.address].basic_blocks[bb.address].instructions[insn.address].features: yield feature, address diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index e6449096..038200b8 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -133,7 +133,8 @@ def extract_file_features(pe, buf): """ for file_handler in FILE_HANDLERS: - for feature, va in file_handler(pe=pe, buf=buf): + # file_handler: type: (pe, bytes) -> Iterable[Tuple[Feature, Address]] + for feature, va in file_handler(pe=pe, buf=buf): # type: ignore yield feature, va @@ -160,7 +161,8 @@ def extract_global_features(pe, buf): Tuple[Feature, VA]: a feature and its location.
""" for handler in GLOBAL_HANDLERS: - for feature, va in handler(pe=pe, buf=buf): + # handler: type: (pe, bytes) -> Iterable[Tuple[Feature, Address]] + for feature, va in handler(pe=pe, buf=buf): # type: ignore yield feature, va @@ -172,7 +174,7 @@ GLOBAL_HANDLERS = ( class PefileFeatureExtractor(FeatureExtractor): def __init__(self, path: str): - super(PefileFeatureExtractor, self).__init__() + super().__init__() self.path = path self.pe = pefile.PE(path) diff --git a/capa/features/extractors/viv/basicblock.py b/capa/features/extractors/viv/basicblock.py index 6341ec3a..9848bec0 100644 --- a/capa/features/extractors/viv/basicblock.py +++ b/capa/features/extractors/viv/basicblock.py @@ -31,7 +31,7 @@ def interface_extract_basic_block_XXX(f: FunctionHandle, bb: BBHandle) -> Iterat yields: (Feature, Address): the feature and the address at which its found. """ - ... + raise NotImplementedError def _bb_has_tight_loop(f, bb): diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index a99f9e75..53683f66 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) class VivisectFeatureExtractor(FeatureExtractor): def __init__(self, vw, path): - super(VivisectFeatureExtractor, self).__init__() + super().__init__() self.vw = vw self.path = path with open(self.path, "rb") as f: diff --git a/capa/features/extractors/viv/function.py b/capa/features/extractors/viv/function.py index 64671711..cf1df527 100644 --- a/capa/features/extractors/viv/function.py +++ b/capa/features/extractors/viv/function.py @@ -27,7 +27,7 @@ def interface_extract_function_XXX(fh: FunctionHandle) -> Iterator[Tuple[Feature yields: (Feature, Address): the feature and the address at which its found. """ - ...
+ raise NotImplementedError def extract_function_calls_to(fhandle: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index ae106c31..738c69a7 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -44,7 +44,7 @@ def interface_extract_instruction_XXX( yields: (Feature, Address): the feature and the address at which its found. """ - ... + raise NotImplementedError def get_imports(vw): diff --git a/capa/features/file.py b/capa/features/file.py index a9b4598e..735464c6 100644 --- a/capa/features/file.py +++ b/capa/features/file.py @@ -12,19 +12,19 @@ from capa.features.common import Feature class Export(Feature): def __init__(self, value: str, description=None): # value is export name - super(Export, self).__init__(value, description=description) + super().__init__(value, description=description) class Import(Feature): def __init__(self, value: str, description=None): # value is import name - super(Import, self).__init__(value, description=description) + super().__init__(value, description=description) class Section(Feature): def __init__(self, value: str, description=None): # value is section name - super(Section, self).__init__(value, description=description) + super().__init__(value, description=description) class FunctionName(Feature): @@ -32,7 +32,7 @@ class FunctionName(Feature): def __init__(self, name: str, description=None): # value is function name - super(FunctionName, self).__init__(name, description=description) + super().__init__(name, description=description) # override the name property set by `capa.features.Feature` # that would be `functionname` (note missing dash) self.name = "function-name" diff --git a/capa/features/insn.py b/capa/features/insn.py index 50dd6133..e5c1a49e 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -8,6 +8,7 @@ import abc from typing import Union, Optional +import capa.helpers 
from capa.features.common import VALID_FEATURE_ACCESS, Feature @@ -21,13 +22,13 @@ def hex(n: int) -> str: class API(Feature): def __init__(self, name: str, description=None): - super(API, self).__init__(name, description=description) + super().__init__(name, description=description) class _AccessFeature(Feature, abc.ABC): # superclass: don't use directly def __init__(self, value: str, access: Optional[str] = None, description: Optional[str] = None): - super(_AccessFeature, self).__init__(value, description=description) + super().__init__(value, description=description) if access is not None: if access not in VALID_FEATURE_ACCESS: raise ValueError("%s access type %s not valid" % (self.name, access)) @@ -47,16 +48,16 @@ class _AccessFeature(Feature, abc.ABC): class Property(_AccessFeature): def __init__(self, value: str, access: Optional[str] = None, description=None): - super(Property, self).__init__(value, access=access, description=description) + super().__init__(value, access=access, description=description) class Number(Feature): def __init__(self, value: Union[int, float], description=None): - super(Number, self).__init__(value, description=description) + super().__init__(value, description=description) def get_value_str(self): if isinstance(self.value, int): - return hex(self.value) + return capa.helpers.hex(self.value) elif isinstance(self.value, float): return str(self.value) else: @@ -69,15 +70,16 @@ MAX_STRUCTURE_SIZE = 0x10000 class Offset(Feature): def __init__(self, value: int, description=None): - super(Offset, self).__init__(value, description=description) + super().__init__(value, description=description) def get_value_str(self): + assert isinstance(self.value, int) return hex(self.value) class Mnemonic(Feature): def __init__(self, value: str, description=None): - super(Mnemonic, self).__init__(value, description=description) + super().__init__(value, description=description) # max number of operands to consider for a given instrucion. 
@@ -91,7 +93,7 @@ class _Operand(Feature, abc.ABC): # superclass: don't use directly # subclasses should set self.name and provide the value string formatter def __init__(self, index: int, value: int, description=None): - super(_Operand, self).__init__(value, description=description) + super().__init__(value, description=description) self.index = index def __hash__(self): @@ -107,7 +109,7 @@ class OperandNumber(_Operand): # operand[i].number: 0x12 def __init__(self, index: int, value: int, description=None): - super(OperandNumber, self).__init__(index, value, description=description) + super().__init__(index, value, description=description) self.name = self.NAMES[index] def get_value_str(self) -> str: @@ -121,7 +123,7 @@ class OperandOffset(_Operand): # operand[i].offset: 0x12 def __init__(self, index: int, value: int, description=None): - super(OperandOffset, self).__init__(index, value, description=description) + super().__init__(index, value, description=description) self.name = self.NAMES[index] def get_value_str(self) -> str: diff --git a/capa/helpers.py b/capa/helpers.py index 9c4c285e..2e44fc6c 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -18,11 +18,13 @@ EXTENSIONS_ELF = "elf_" logger = logging.getLogger("capa") -_hex = hex - -def hex(i): - return _hex(int(i)) +def hex(n: int) -> str: + """render the given number using upper case hex, like: 0x123ABC""" + if n < 0: + return "-0x%X" % (-n) + else: + return "0x%X" % n def get_file_taste(sample_path: str) -> bytes: diff --git a/capa/ida/helpers.py b/capa/ida/helpers.py index 27b22a99..d1ef3093 100644 --- a/capa/ida/helpers.py +++ b/capa/ida/helpers.py @@ -27,6 +27,7 @@ SUPPORTED_FILE_TYPES = ( idaapi.f_PE, idaapi.f_ELF, idaapi.f_BIN, + idaapi.f_COFF, # idaapi.f_MACHO, ) @@ -170,7 +171,7 @@ class IDAIO: """ def __init__(self): - super(IDAIO, self).__init__() + super().__init__() self.offset = 0 def seek(self, offset, whence=0): diff --git a/capa/ida/plugin/README.md b/capa/ida/plugin/README.md index 
4364d4aa..23002b97 100644 --- a/capa/ida/plugin/README.md +++ b/capa/ida/plugin/README.md @@ -1,8 +1,8 @@ ![capa explorer](../../../.github/capa-explorer-logo.png) capa explorer is an IDAPython plugin that integrates the FLARE team's open-source framework, capa, with IDA Pro. capa is a framework that uses a well-defined collection of rules to -identify capabilities in a program. You can run capa against a PE file or shellcode and it tells you what it thinks the program can do. For example, it might suggest that -the program is a backdoor, can install services, or relies on HTTP to communicate. capa explorer runs capa directly against your IDA Pro database (IDB) without requiring access +identify capabilities in a program. You can run capa against a PE file, ELF file, or shellcode and it tells you what it thinks the program can do. For example, it might suggest that +the program is a backdoor, can install services, or relies on HTTP to communicate. capa explorer runs capa analysis on your IDA Pro database (IDB) without needing access to the original binary file. Once a database has been analyzed, capa explorer helps you identify interesting areas of a program and build new capa rules using features extracted from your IDB. We love using capa explorer during malware analysis because it teaches us what parts of a program suggest a behavior. As we click on rows, capa explorer jumps directly @@ -21,10 +21,10 @@ We can use capa explorer to navigate our Disassembly view directly to the suspec Using the `Rule Information` and `Details` columns capa explorer shows us that the suspect function matched `self delete via COMSPEC environment variable` because it contains capa rule matches for `create process`, `get COMSPEC environment variable`, and `query environment variable`, references to the strings `COMSPEC`, ` > nul`, and `/c del `, and calls to the Windows API functions `GetEnvironmentVariableA` and `ShellExecuteEx`. -capa explorer also helps you build new capa rules. 
To start select the `Rule Generator` tab, navigate to a function in your Disassembly view, +capa explorer also helps you build and test new capa rules. To start, select the `Rule Generator` tab, navigate to a function in your Disassembly view, and click `Analyze`. capa explorer will extract features from the function and display them in the `Features` pane. You can add features listed in this pane to the `Editor` pane by either double-clicking a feature or using multi-select + right-click to add multiple features at once. The `Preview` and `Editor` panes help edit your rule. Use the `Preview` pane -to modify the rule text directly and the `Editor` pane to construct and rearrange your hierarchy of statements and features. When you finish a rule you can save it directly to a file by clicking `Save`. +to modify rule text directly and the `Editor` pane to construct and rearrange your hierarchy of statements and features. When you finish a rule you can save it directly to a file by clicking `Save`. ![](../../../doc/img/rulegen_expanded.png) @@ -32,62 +32,28 @@ For more information on the FLARE team's open-source framework, capa, check out ## Getting Started -### Requirements +### Installation -capa explorer supports Python versions >= 3.7.x and the following IDA Pro versions: - -* IDA 7.4 -* IDA 7.5 -* IDA 7.6 (caveat below) -* IDA 7.7 - -capa explorer is however limited to the Python versions supported by your IDA installation (which may not include all Python versions >= 3.7.x). Based on our testing the following matrix shows the Python versions supported -by each supported IDA version: - -| | IDA 7.4 | IDA 7.5 | IDA 7.6 | -| --- | --- | --- | --- | -| Python 3.7.x | Yes | Yes | Yes | -| Python 3.8.x | Partial (see below) | Yes | Yes | -| Python 3.9.x | No | Partial (see below) | Yes | - -To use capa explorer with IDA 7.4 and Python 3.8.x you must follow the instructions provided by hex-rays [here](https://hex-rays.com/blog/ida-7-4-and-python-3-8/). 
- -To use capa explorer with IDA 7.5 and Python 3.9.x you must follow the instructions provided by hex-rays [here](https://hex-rays.com/blog/python-3-9-support-for-ida-7-5/). - -If you encounter issues with your specific setup, please open a new [Issue](https://github.com/mandiant/capa/issues). - -#### IDA 7.6 caveat: IDA 7.6sp1 or patch required - -As described [here](https://www.hex-rays.com/blog/ida-7-6-empty-qtreeview-qtreewidget/): - -> A rather nasty issue evaded our testing and found its way into IDA 7.6: using the PyQt5 modules that are shipped with IDA, QTreeView (or QTreeWidget) instances will always fail to display contents. - -Therefore, in order to use capa under IDA 7.6 you need the [Service Pack 1 for IDA 7.6](https://www.hex-rays.com/products/ida/news/7_6sp1). Alternatively, you can download and install the fix corresponding to your IDA installation, replacing the original QtWidgets DLL with the one contained in the .zip file (links to Hex-Rays): - - - - Windows: [pyqt5_qtwidgets_win](https://www.hex-rays.com/wp-content/uploads/2021/04/pyqt5_qtwidgets_win.zip) - - Linux: [pyqt5_qtwidgets_linux](https://www.hex-rays.com/wp-content/uploads/2021/04/pyqt5_qtwidgets_linux.zip) - - MacOS (Intel): [pyqt5_qtwidgets_mac_x64](https://www.hex-rays.com/wp-content/uploads/2021/04/pyqt5_qtwidgets_mac_x64.zip) - - MacOS (AppleSilicon): [pyqt5_qtwidgets_mac_arm](https://www.hex-rays.com/wp-content/uploads/2021/04/pyqt5_qtwidgets_mac_arm.zip) +You can install capa explorer using the following steps: +1. Install capa and its dependencies from PyPI using the Python interpreter configured for your IDA installation: + ``` + $ pip install flare-capa + ``` +2. Download and extract the [official capa rules](https://github.com/mandiant/capa-rules/releases) that match the version of capa you have installed + 1. Use the following command to view the version of capa you have installed: + ```commandline + $ pip show flare-capa + ``` +3. 
Copy [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ida/plugin/capa_explorer.py) to your IDA plugins directory ### Supported File Types capa explorer is limited to the file types supported by capa, which include: -* Windows x86 (32- and 64-bit) PE and ELF files +* Windows x86 (32- and 64-bit) PE files * Windows x86 (32- and 64-bit) shellcode - -### Installation - -You can install capa explorer using the following steps: - -1. Install capa and its dependencies from PyPI for the Python interpreter used by your IDA installation: - ``` - $ pip install flare-capa - ``` -3. Download the [standard collection of capa rules](https://github.com/mandiant/capa-rules) (capa explorer needs capa rules to analyze a database) -4. Copy [capa_explorer.py](https://raw.githubusercontent.com/mandiant/capa/master/capa/ida/plugin/capa_explorer.py) to your IDA plugins directory +* ELF files on various operating systems ### Usage @@ -97,15 +63,15 @@ You can install capa explorer using the following steps: 3. Select the `Program Analysis` tab 4. Click the `Analyze` button -When running capa explorer for the first time you are prompted to select a file directory containing capa rules. The plugin conveniently -remembers your selection for future runs; you can change this selection and other default settings by clicking `Settings`. We recommend -downloading and using the [standard collection of capa rules](https://github.com/mandiant/capa-rules) when getting started with the plugin. +The first time you run capa explorer you will be asked to specify a local directory containing capa rules to use for analysis. We recommend downloading and extracting the [official capa rules](https://github.com/mandiant/capa-rules/releases) that match +the version of capa you have installed (see installation instructions above for more details). capa explorer remembers your selection for future analysis which you +can update using the `Settings` button. 
#### Tips for Program Analysis * Start analysis by clicking the `Analyze` button * Reset the plugin user interface and remove highlighting from your Disassembly view by clicking the `Reset` button -* Change your capa rules directory and other default settings by clicking `Settings` +* Change your local capa rules directory and other default settings by clicking the `Settings` button * Hover your cursor over a rule match to view the source content of the rule * Double-click the `Address` column to navigate your Disassembly view to the address of the associated feature * Double-click a result in the `Rule Information` column to expand its children @@ -122,6 +88,22 @@ downloading and using the [standard collection of capa rules](https://github.com * Directly edit rule text and metadata fields using the `Preview` pane * Change the default rule author and default rule scope displayed in the `Preview` pane by clicking `Settings` +### Requirements + +capa explorer supports Python versions >= 3.7.x and IDA Pro versions >= 7.4. The following IDA Pro versions have been tested: + +* IDA 7.4 +* IDA 7.5 +* IDA 7.6 Service Pack 1 +* IDA 7.7 +* IDA 8.0 +* IDA 8.1 +* IDA 8.2 + +capa explorer is however limited to the Python versions supported by your IDA installation (which may not include all Python versions >= 3.7.x). + +If you encounter issues with your specific setup, please open a new [Issue](https://github.com/mandiant/capa/issues). + ## Development capa explorer is packaged with capa so you will need to install capa locally for development. 
You can install capa locally by following the steps outlined in `Method 3: Inspecting the capa source code` of the [capa diff --git a/capa/ida/plugin/__init__.py b/capa/ida/plugin/__init__.py index 6100f3db..4ffc09af 100644 --- a/capa/ida/plugin/__init__.py +++ b/capa/ida/plugin/__init__.py @@ -93,7 +93,7 @@ class OnUpdatedActionsHook(ida_kernwin.UI_Hooks): """register a callback to be invoked each time the UI actions are updated""" def __init__(self, cb): - super(OnUpdatedActionsHook, self).__init__() + super().__init__() self.cb = cb def updated_actions(self): diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index 41d6ed94..065b09f4 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -11,7 +11,7 @@ import copy import logging import itertools import collections -from typing import Set, Dict, Optional +from typing import Any, Set, Dict, List, Optional import idaapi import ida_kernwin @@ -21,6 +21,7 @@ from PyQt5 import QtGui, QtCore, QtWidgets import capa.main import capa.rules import capa.engine +import capa.version import capa.ida.helpers import capa.render.json import capa.features.common @@ -48,6 +49,11 @@ CAPA_SETTINGS_RULE_PATH = "rule_path" CAPA_SETTINGS_RULEGEN_AUTHOR = "rulegen_author" CAPA_SETTINGS_RULEGEN_SCOPE = "rulegen_scope" + +CAPA_OFFICIAL_RULESET_URL = f"https://github.com/mandiant/capa-rules/releases/tag/v{capa.version.__version__}" +CAPA_RULESET_DOC_URL = "https://github.com/mandiant/capa/blob/master/doc/rules.md" + + from enum import IntFlag @@ -72,14 +78,14 @@ def trim_function_name(f, max_length=25): def find_func_features(fh: FunctionHandle, extractor): """ """ - func_features: Dict[Feature, Set] = collections.defaultdict(set) - bb_features: Dict[Address, Dict] = collections.defaultdict(dict) + func_features: Dict[Feature, Set[Address]] = collections.defaultdict(set) + bb_features: Dict[Address, Dict[Feature, Set[Address]]] = collections.defaultdict(dict) for (feature, addr) in 
extractor.extract_function_features(fh): func_features[feature].add(addr) for bbh in extractor.get_basic_blocks(fh): - _bb_features = collections.defaultdict(set) + _bb_features: Dict[Feature, Set[Address]] = collections.defaultdict(set) for (feature, addr) in extractor.extract_basic_block_features(fh, bbh): _bb_features[feature].add(addr) @@ -155,7 +161,7 @@ class CapaExplorerProgressIndicator(QtCore.QObject): def __init__(self): """initialize signal object""" - super(CapaExplorerProgressIndicator, self).__init__() + super().__init__() def update(self, text): """emit progress update @@ -174,18 +180,18 @@ class CapaExplorerFeatureExtractor(capa.features.extractors.ida.extractor.IdaFea """ def __init__(self): - super(CapaExplorerFeatureExtractor, self).__init__() + super().__init__() self.indicator = CapaExplorerProgressIndicator() def extract_function_features(self, fh: FunctionHandle): self.indicator.update("function at 0x%X" % fh.inner.start_ea) - return super(CapaExplorerFeatureExtractor, self).extract_function_features(fh) + return super().extract_function_features(fh) class QLineEditClicked(QtWidgets.QLineEdit): def __init__(self, content, parent=None): """ """ - super(QLineEditClicked, self).__init__(content, parent) + super().__init__(content, parent) def mouseReleaseEvent(self, e): """ """ @@ -204,7 +210,7 @@ class QLineEditClicked(QtWidgets.QLineEdit): class CapaSettingsInputDialog(QtWidgets.QDialog): def __init__(self, title, parent=None): """ """ - super(CapaSettingsInputDialog, self).__init__(parent) + super().__init__(parent) self.setWindowTitle(title) self.setMinimumWidth(500) @@ -213,6 +219,12 @@ class CapaSettingsInputDialog(QtWidgets.QDialog): self.edit_rule_path = QLineEditClicked(settings.user.get(CAPA_SETTINGS_RULE_PATH, "")) self.edit_rule_author = QtWidgets.QLineEdit(settings.user.get(CAPA_SETTINGS_RULEGEN_AUTHOR, "")) self.edit_rule_scope = QtWidgets.QComboBox() + self.edit_rules_link = QtWidgets.QLabel() + + self.edit_rules_link.setText( + 
f'Download and extract official capa rules' + ) + self.edit_rules_link.setOpenExternalLinks(True) scopes = ("file", "function", "basic block") @@ -222,7 +234,8 @@ class CapaSettingsInputDialog(QtWidgets.QDialog): buttons = QtWidgets.QDialogButtonBox(QtWidgets.QDialogButtonBox.Ok | QtWidgets.QDialogButtonBox.Cancel, self) layout = QtWidgets.QFormLayout(self) - layout.addRow("capa rules path", self.edit_rule_path) + layout.addRow("capa rules", self.edit_rule_path) + layout.addRow("", self.edit_rules_link) layout.addRow("Default rule author", self.edit_rule_author) layout.addRow("Default rule scope", self.edit_rule_scope) @@ -239,53 +252,52 @@ class CapaSettingsInputDialog(QtWidgets.QDialog): class CapaExplorerForm(idaapi.PluginForm): """form element for plugin interface""" - def __init__(self, name, option=Options.DEFAULT): + def __init__(self, name: str, option=Options.DEFAULT): """initialize form elements""" - super(CapaExplorerForm, self).__init__() + super().__init__() - self.form_title = name - self.process_total = 0 - self.process_count = 0 + self.form_title: str = name + self.process_total: int = 0 + self.process_count: int = 0 - self.parent = None - self.ida_hooks = None + self.parent: Any # QtWidget + self.ida_hooks: CapaExplorerIdaHooks self.doc: Optional[capa.render.result_document.ResultDocument] = None - self.rule_paths = None - self.rules_cache = None - self.ruleset_cache = None + self.rule_paths: Optional[List[str]] + self.rules_cache: Optional[List[capa.rules.Rule]] + self.ruleset_cache: Optional[capa.rules.RuleSet] # models - self.model_data = None - self.range_model_proxy = None - self.search_model_proxy = None + self.model_data: CapaExplorerDataModel + self.range_model_proxy: CapaExplorerRangeProxyModel + self.search_model_proxy: CapaExplorerSearchProxyModel # UI controls - self.view_limit_results_by_function = None - self.view_show_results_by_function = None - self.view_search_bar = None - self.view_tree = None - self.view_rulegen = None - 
self.view_tabs = None + self.view_limit_results_by_function: QtWidgets.QCheckBox + self.view_show_results_by_function: QtWidgets.QCheckBox + self.view_search_bar: QtWidgets.QLineEdit + self.view_tree: CapaExplorerQtreeView + self.view_tabs: QtWidgets.QTabWidget self.view_tab_rulegen = None - self.view_status_label = None - self.view_buttons = None - self.view_analyze_button = None - self.view_reset_button = None - self.view_settings_button = None - self.view_save_button = None + self.view_status_label: QtWidgets.QLabel + self.view_buttons: QtWidgets.QHBoxLayout + self.view_analyze_button: QtWidgets.QPushButton + self.view_reset_button: QtWidgets.QPushButton + self.view_settings_button: QtWidgets.QPushButton + self.view_save_button: QtWidgets.QPushButton - self.view_rulegen_preview = None - self.view_rulegen_features = None - self.view_rulegen_editor = None - self.view_rulegen_header_label = None - self.view_rulegen_search = None - self.view_rulegen_limit_features_by_ea = None - self.rulegen_current_function = None - self.rulegen_bb_features_cache = {} - self.rulegen_func_features_cache = {} - self.rulegen_file_features_cache = {} - self.view_rulegen_status_label = None + self.view_rulegen_preview: CapaExplorerRulegenPreview + self.view_rulegen_features: CapaExplorerRulegenFeatures + self.view_rulegen_editor: CapaExplorerRulegenEditor + self.view_rulegen_header_label: QtWidgets.QLabel + self.view_rulegen_search: QtWidgets.QLineEdit + self.view_rulegen_limit_features_by_ea: QtWidgets.QCheckBox + self.rulegen_current_function: Optional[FunctionHandle] + self.rulegen_bb_features_cache: Dict[Address, Dict[Feature, Set[Address]]] = {} + self.rulegen_func_features_cache: Dict[Feature, Set[Address]] = {} + self.rulegen_file_features_cache: Dict[Feature, Set[Address]] = {} + self.view_rulegen_status_label: QtWidgets.QLabel self.Show() @@ -305,7 +317,7 @@ class CapaExplorerForm(idaapi.PluginForm): def Show(self): """creates form if not already create, else brings plugin to 
front""" - return super(CapaExplorerForm, self).Show( + return super().Show( self.form_title, options=( idaapi.PluginForm.WOPN_TAB @@ -614,7 +626,7 @@ class CapaExplorerForm(idaapi.PluginForm): """ if post: if idaapi.get_imagebase() != meta.get("prev_base", -1): - capa.ida.helpers.inform_user_ida_ui("Running capa analysis again after program rebase") + capa.ida.helpers.inform_user_ida_ui("Running capa analysis using new program base") self.slot_analyze() else: meta["prev_base"] = idaapi.get_imagebase() @@ -629,15 +641,36 @@ class CapaExplorerForm(idaapi.PluginForm): try: # resolve rules directory - check self and settings first, then ask user if not os.path.exists(settings.user.get(CAPA_SETTINGS_RULE_PATH, "")): - idaapi.info("Please select a file directory containing capa rules.") + # configure rules selection messagebox + rules_message = QtWidgets.QMessageBox() + rules_message.setIcon(QtWidgets.QMessageBox.Information) + rules_message.setWindowTitle("capa explorer") + rules_message.setText("You must specify a directory containing capa rules before running analysis.") + rules_message.setInformativeText( + "Click 'Ok' to specify a local directory of rules or you can download and extract the official " + f"rules from the URL listed in the details." + ) + rules_message.setDetailedText(f"{CAPA_OFFICIAL_RULESET_URL}") + rules_message.setStandardButtons(QtWidgets.QMessageBox.Ok | QtWidgets.QMessageBox.Cancel) + + # display rules selection messagebox, check user button selection + pressed = rules_message.exec_() + if pressed == QtWidgets.QMessageBox.Cancel: + raise UserCancelledError() + path = self.ask_user_directory() if not path: - logger.warning( - "You must select a file directory containing capa rules before analysis can be run. The standard collection of capa rules can be downloaded from https://github.com/mandiant/capa-rules." 
- ) - return False + raise UserCancelledError() + settings.user[CAPA_SETTINGS_RULE_PATH] = path + except UserCancelledError as e: + capa.ida.helpers.inform_user_ida_ui("Analysis requires capa rules") + logger.warning( + f"You must specify a directory containing capa rules before running analysis. Download and extract the official rules from {CAPA_OFFICIAL_RULESET_URL} (recommended)." + ) + return False except Exception as e: + capa.ida.helpers.inform_user_ida_ui("Failed to load capa rules") logger.error("Failed to load capa rules (error: %s).", e) return False @@ -700,22 +733,16 @@ class CapaExplorerForm(idaapi.PluginForm): capa.ida.helpers.inform_user_ida_ui( "Failed to load capa rules from %s" % settings.user[CAPA_SETTINGS_RULE_PATH] ) - logger.error("Failed to load rules from %s (error: %s).", settings.user[CAPA_SETTINGS_RULE_PATH], e) + + logger.error("Failed to load capa rules from %s (error: %s).", settings.user[CAPA_SETTINGS_RULE_PATH], e) logger.error( - "Make sure your file directory contains properly formatted capa rules. You can download the standard collection of capa rules from https://github.com/mandiant/capa-rules." - ) - logger.error( - "Please ensure you're using the rules that correspond to your major version of capa (%s)", - capa.version.get_major_version(), - ) - logger.error( - "You can check out these rules with the following command:\n %s", - capa.version.get_rules_checkout_command(), - ) - logger.error( - "Or, for more details, see the rule set documentation here: %s", - "https://github.com/mandiant/capa/blob/master/doc/rules.md", + "Make sure your file directory contains properly " + "formatted capa rules. You can download and extract the official rules from %s. 
" + "Or, for more details, see the rules documentation here: %s", + CAPA_OFFICIAL_RULESET_URL, + CAPA_RULESET_DOC_URL, ) + settings.user[CAPA_SETTINGS_RULE_PATH] = "" return False @@ -762,6 +789,9 @@ class CapaExplorerForm(idaapi.PluginForm): if not self.load_capa_rules(): return False + assert self.rules_cache is not None + assert self.ruleset_cache is not None + if ida_kernwin.user_cancelled(): logger.info("User cancelled analysis.") return False @@ -822,9 +852,16 @@ class CapaExplorerForm(idaapi.PluginForm): return False try: + # either the results are cached and the doc already exists, + # or the doc was just created above + assert self.doc is not None + # same with rules cache, either it's cached or it was just loaded + assert self.rules_cache is not None + assert self.ruleset_cache is not None + self.model_data.render_capa_doc(self.doc, self.view_show_results_by_function.isChecked()) self.set_view_status_label( - "capa rules directory: %s (%d rules)" % (settings.user[CAPA_SETTINGS_RULE_PATH], len(self.rules_cache)) + "capa rules: %s (%d rules)" % (settings.user[CAPA_SETTINGS_RULE_PATH], len(self.rules_cache)) ) except Exception as e: logger.error("Failed to render results (error: %s)", e, exc_info=True) @@ -869,7 +906,10 @@ class CapaExplorerForm(idaapi.PluginForm): if not self.load_capa_rules(): return False else: - logger.info('Using cached ruleset, click "Reset" to reload rules from disk.') + logger.info('Using cached capa rules, click "Reset" to load rules from disk.') + + assert self.rules_cache is not None + assert self.ruleset_cache is not None if ida_kernwin.user_cancelled(): logger.info("User cancelled analysis.") @@ -891,7 +931,8 @@ class CapaExplorerForm(idaapi.PluginForm): try: f = idaapi.get_func(idaapi.get_screen_ea()) if f: - fh: FunctionHandle = extractor.get_function(f.start_ea) + fh: Optional[FunctionHandle] = extractor.get_function(f.start_ea) + assert fh is not None self.rulegen_current_function = fh func_features, bb_features = 
find_func_features(fh, extractor) @@ -916,6 +957,7 @@ class CapaExplorerForm(idaapi.PluginForm): logger.error("Failed to match function/basic block rule scope (error: %s)", e) return False else: + fh = None func_features = {} except UserCancelledError: logger.info("User cancelled analysis.") @@ -1022,7 +1064,7 @@ class CapaExplorerForm(idaapi.PluginForm): self.view_rulegen_status_label.clear() if not is_analyze: - # clear rules and ruleset cache only if user clicked "Reset" + # clear rules and rule set cache only if user clicked "Reset" self.rules_cache = None self.ruleset_cache = None @@ -1052,6 +1094,8 @@ class CapaExplorerForm(idaapi.PluginForm): def update_rule_status(self, rule_text): """ """ + assert self.rules_cache is not None + if not self.view_rulegen_editor.invisibleRootItem().childCount(): self.set_rulegen_preview_border_neutral() self.view_rulegen_status_label.clear() @@ -1076,7 +1120,7 @@ class CapaExplorerForm(idaapi.PluginForm): rules.append(rule) try: - file_features = copy.copy(self.rulegen_file_features_cache) + file_features = copy.copy(dict(self.rulegen_file_features_cache)) if self.rulegen_current_function: func_matches, bb_matches = find_func_matches( self.rulegen_current_function, @@ -1092,7 +1136,7 @@ class CapaExplorerForm(idaapi.PluginForm): _, file_matches = capa.engine.match( capa.rules.RuleSet(list(capa.rules.get_rules_and_dependencies(rules, rule.name))).file_rules, file_features, - 0x0, + NO_ADDRESS, ) except Exception as e: self.set_rulegen_status("Failed to match rule (%s)" % e) diff --git a/capa/ida/plugin/hooks.py b/capa/ida/plugin/hooks.py index 9043f989..23d87821 100644 --- a/capa/ida/plugin/hooks.py +++ b/capa/ida/plugin/hooks.py @@ -16,7 +16,7 @@ class CapaExplorerIdaHooks(idaapi.UI_Hooks): @param screen_ea_changed_hook: function hook for IDA screen ea changed @param action_hooks: dict of IDA action handles """ - super(CapaExplorerIdaHooks, self).__init__() + super().__init__() self.screen_ea_changed_hook = 
screen_ea_changed_hook self.process_action_hooks = action_hooks diff --git a/capa/ida/plugin/item.py b/capa/ida/plugin/item.py index 5cbc4b24..ac349424 100644 --- a/capa/ida/plugin/item.py +++ b/capa/ida/plugin/item.py @@ -36,7 +36,7 @@ def ea_to_hex(ea): class CapaExplorerDataItem: """store data for CapaExplorerDataModel""" - def __init__(self, parent: "CapaExplorerDataItem", data: List[str], can_check=True): + def __init__(self, parent: Optional["CapaExplorerDataItem"], data: List[str], can_check=True): """initialize item""" self.pred = parent self._data = data @@ -110,7 +110,7 @@ class CapaExplorerDataItem: except IndexError: return None - def parent(self) -> "CapaExplorerDataItem": + def parent(self) -> Optional["CapaExplorerDataItem"]: """get parent""" return self.pred @@ -181,7 +181,7 @@ class CapaExplorerRuleItem(CapaExplorerDataItem): @param source: rule source (tooltip) """ display = self.fmt % (name, count) if count > 1 else name - super(CapaExplorerRuleItem, self).__init__(parent, [display, "", namespace], can_check) + super().__init__(parent, [display, "", namespace], can_check) self._source = source @property @@ -200,7 +200,7 @@ class CapaExplorerRuleMatchItem(CapaExplorerDataItem): @param display: text to display in UI @param source: rule match source to display (tooltip) """ - super(CapaExplorerRuleMatchItem, self).__init__(parent, [display, "", ""]) + super().__init__(parent, [display, "", ""]) self._source = source @property @@ -222,14 +222,12 @@ class CapaExplorerFunctionItem(CapaExplorerDataItem): """ assert isinstance(location, AbsoluteVirtualAddress) ea = int(location) - super(CapaExplorerFunctionItem, self).__init__( - parent, [self.fmt % idaapi.get_name(ea), ea_to_hex(ea), ""], can_check - ) + super().__init__(parent, [self.fmt % idaapi.get_name(ea), ea_to_hex(ea), ""], can_check) @property def info(self): """return function name""" - info = super(CapaExplorerFunctionItem, self).info + info = super().info display = info_to_name(info) return 
display if display else info @@ -255,7 +253,7 @@ class CapaExplorerSubscopeItem(CapaExplorerDataItem): @param parent: parent node @param scope: subscope name """ - super(CapaExplorerSubscopeItem, self).__init__(parent, [self.fmt % scope, "", ""]) + super().__init__(parent, [self.fmt % scope, "", ""]) class CapaExplorerBlockItem(CapaExplorerDataItem): @@ -271,7 +269,7 @@ class CapaExplorerBlockItem(CapaExplorerDataItem): """ assert isinstance(location, AbsoluteVirtualAddress) ea = int(location) - super(CapaExplorerBlockItem, self).__init__(parent, [self.fmt % ea, ea_to_hex(ea), ""]) + super().__init__(parent, [self.fmt % ea, ea_to_hex(ea), ""]) class CapaExplorerInstructionItem(CapaExplorerBlockItem): @@ -298,9 +296,7 @@ class CapaExplorerDefaultItem(CapaExplorerDataItem): assert isinstance(location, AbsoluteVirtualAddress) ea = int(location) - super(CapaExplorerDefaultItem, self).__init__( - parent, [display, ea_to_hex(ea) if ea is not None else "", details] - ) + super().__init__(parent, [display, ea_to_hex(ea) if ea is not None else "", details]) class CapaExplorerFeatureItem(CapaExplorerDataItem): @@ -319,9 +315,9 @@ class CapaExplorerFeatureItem(CapaExplorerDataItem): if location: assert isinstance(location, (AbsoluteVirtualAddress, FileOffsetAddress)) ea = int(location) - super(CapaExplorerFeatureItem, self).__init__(parent, [display, ea_to_hex(ea), details]) + super().__init__(parent, [display, ea_to_hex(ea), details]) else: - super(CapaExplorerFeatureItem, self).__init__(parent, [display, "", details]) + super().__init__(parent, [display, "", details]) class CapaExplorerInstructionViewItem(CapaExplorerFeatureItem): @@ -339,7 +335,7 @@ class CapaExplorerInstructionViewItem(CapaExplorerFeatureItem): assert isinstance(location, AbsoluteVirtualAddress) ea = int(location) details = capa.ida.helpers.get_disasm_line(ea) - super(CapaExplorerInstructionViewItem, self).__init__(parent, display, location=location, details=details) + super().__init__(parent, display, 
location=location, details=details) self.ida_highlight = idc.get_color(ea, idc.CIC_ITEM) @@ -365,7 +361,7 @@ class CapaExplorerByteViewItem(CapaExplorerFeatureItem): byte_snap = codecs.encode(byte_snap, "hex").upper() details = " ".join([byte_snap[i : i + 2].decode() for i in range(0, len(byte_snap), 2)]) - super(CapaExplorerByteViewItem, self).__init__(parent, display, location=location, details=details) + super().__init__(parent, display, location=location, details=details) self.ida_highlight = idc.get_color(ea, idc.CIC_ITEM) @@ -382,5 +378,5 @@ class CapaExplorerStringViewItem(CapaExplorerFeatureItem): assert isinstance(location, (AbsoluteVirtualAddress, FileOffsetAddress)) ea = int(location) - super(CapaExplorerStringViewItem, self).__init__(parent, display, location=location, details=value) + super().__init__(parent, display, location=location, details=value) self.ida_highlight = idc.get_color(ea, idc.CIC_ITEM) diff --git a/capa/ida/plugin/model.py b/capa/ida/plugin/model.py index 05ac83fb..d0a6a857 100644 --- a/capa/ida/plugin/model.py +++ b/capa/ida/plugin/model.py @@ -51,7 +51,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): def __init__(self, parent=None): """initialize model""" - super(CapaExplorerDataModel, self).__init__(parent) + super().__init__(parent) # root node does not have parent, contains header columns self.root_node = CapaExplorerDataItem(None, ["Rule Information", "Address", "Details"]) @@ -530,6 +530,14 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): if value: if isinstance(feature, frzf.StringFeature): value = '"%s"' % capa.features.common.escape_string(value) + + if isinstance(feature, frzf.PropertyFeature) and feature.access is not None: + key = f"property/{feature.access}" + elif isinstance(feature, frzf.OperandNumberFeature): + key = f"operand[{feature.index}].number" + elif isinstance(feature, frzf.OperandOffsetFeature): + key = f"operand[{feature.index}].offset" + if feature.description: return "%s(%s = %s)" % 
(key, value, feature.description) else: diff --git a/capa/ida/plugin/proxy.py b/capa/ida/plugin/proxy.py index f61a0eb0..e67147bd 100644 --- a/capa/ida/plugin/proxy.py +++ b/capa/ida/plugin/proxy.py @@ -22,7 +22,7 @@ class CapaExplorerRangeProxyModel(QtCore.QSortFilterProxyModel): def __init__(self, parent=None): """initialize proxy filter""" - super(CapaExplorerRangeProxyModel, self).__init__(parent) + super().__init__(parent) self.min_ea = None self.max_ea = None @@ -92,7 +92,7 @@ class CapaExplorerRangeProxyModel(QtCore.QSortFilterProxyModel): @param parent: QModelIndex of parent """ # filter not set - if self.min_ea is None and self.max_ea is None: + if self.min_ea is None or self.max_ea is None: return True index = self.sourceModel().index(row, 0, parent) @@ -145,7 +145,7 @@ class CapaExplorerSearchProxyModel(QtCore.QSortFilterProxyModel): def __init__(self, parent=None): """ """ - super(CapaExplorerSearchProxyModel, self).__init__(parent) + super().__init__(parent) self.query = "" self.setFilterKeyColumn(-1) # all columns diff --git a/capa/ida/plugin/view.py b/capa/ida/plugin/view.py index ef656736..0f577c7d 100644 --- a/capa/ida/plugin/view.py +++ b/capa/ida/plugin/view.py @@ -18,7 +18,7 @@ import capa.ida.helpers import capa.features.common import capa.features.basicblock from capa.ida.plugin.item import CapaExplorerFunctionItem -from capa.features.address import Address, _NoAddress +from capa.features.address import AbsoluteVirtualAddress, _NoAddress from capa.ida.plugin.model import CapaExplorerDataModel MAX_SECTION_SIZE = 750 @@ -179,11 +179,12 @@ class CapaExplorerRulegenPreview(QtWidgets.QTextEdit): def __init__(self, parent=None): """ """ - super(CapaExplorerRulegenPreview, self).__init__(parent) + super().__init__(parent) self.setFont(QtGui.QFont("Courier", weight=QtGui.QFont.Bold)) self.setLineWrapMode(QtWidgets.QTextEdit.NoWrap) self.setHorizontalScrollBarPolicy(QtCore.Qt.ScrollBarAsNeeded) + self.setAcceptRichText(False) def reset_view(self): """ 
""" @@ -284,7 +285,7 @@ class CapaExplorerRulegenPreview(QtWidgets.QTextEdit): self.set_selection(select_start_ppos, select_end_ppos, len(self.toPlainText())) self.verticalScrollBar().setSliderPosition(scroll_ppos) else: - super(CapaExplorerRulegenPreview, self).keyPressEvent(e) + super().keyPressEvent(e) def count_previous_lines_from_block(self, block): """calculate number of lines preceding block""" @@ -310,7 +311,7 @@ class CapaExplorerRulegenEditor(QtWidgets.QTreeWidget): def __init__(self, preview, parent=None): """ """ - super(CapaExplorerRulegenEditor, self).__init__(parent) + super().__init__(parent) self.preview = preview @@ -374,18 +375,18 @@ class CapaExplorerRulegenEditor(QtWidgets.QTreeWidget): def dragMoveEvent(self, e): """ """ - super(CapaExplorerRulegenEditor, self).dragMoveEvent(e) + super().dragMoveEvent(e) def dragEventEnter(self, e): """ """ - super(CapaExplorerRulegenEditor, self).dragEventEnter(e) + super().dragEventEnter(e) def dropEvent(self, e): """ """ if not self.indexAt(e.pos()).isValid(): return - super(CapaExplorerRulegenEditor, self).dropEvent(e) + super().dropEvent(e) self.update_preview() expand_tree(self.invisibleRootItem()) @@ -784,7 +785,7 @@ class CapaExplorerRulegenEditor(QtWidgets.QTreeWidget): class CapaExplorerRulegenFeatures(QtWidgets.QTreeWidget): def __init__(self, editor, parent=None): """ """ - super(CapaExplorerRulegenFeatures, self).__init__(parent) + super().__init__(parent) self.parent_items = {} self.editor = editor @@ -1012,8 +1013,10 @@ class CapaExplorerRulegenFeatures(QtWidgets.QTreeWidget): self.parent_items = {} def format_address(e): - assert isinstance(e, Address) - return "%X" % e if not isinstance(e, _NoAddress) else "" + if isinstance(e, AbsoluteVirtualAddress): + return "%X" % int(e) + else: + return "" def format_feature(feature): """ """ @@ -1072,7 +1075,7 @@ class CapaExplorerQtreeView(QtWidgets.QTreeView): def __init__(self, model, parent=None): """initialize view""" - super(CapaExplorerQtreeView, 
self).__init__(parent) + super().__init__(parent) self.setModel(model) diff --git a/capa/main.py b/capa/main.py index 2ef66ee7..ee356384 100644 --- a/capa/main.py +++ b/capa/main.py @@ -65,7 +65,7 @@ from capa.features.common import ( FORMAT_DOTNET, FORMAT_FREEZE, ) -from capa.features.address import NO_ADDRESS +from capa.features.address import NO_ADDRESS, Address from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor RULES_PATH_DEFAULT_STRING = "(embedded rules)" @@ -330,7 +330,7 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon logger.warning("-" * 80) for line in file_limitation_rule.meta.get("description", "").split("\n"): - logger.warning(" " + line) + logger.warning(" %s", line) logger.warning(" Identified via rule: %s", file_limitation_rule.name) if is_standalone: logger.warning(" ") @@ -431,7 +431,7 @@ def get_default_signatures() -> List[str]: logger.debug("signatures path: %s", sigs_path) ret = [] - for root, dirs, files in os.walk(sigs_path): + for root, _, files in os.walk(sigs_path): for file in files: if not (file.endswith(".pat") or file.endswith(".pat.gz") or file.endswith(".sig")): continue @@ -459,6 +459,7 @@ def get_workspace(path, format_, sigpaths): # lazy import enables us to not require viv if user wants SMDA, for example. 
import viv_utils + import viv_utils.flirt logger.debug("generating vivisect workspace for: %s", path) # TODO should not be auto at this point, anymore @@ -571,7 +572,7 @@ def get_rules(rule_paths: List[str], disable_progress=False) -> List[Rule]: rule_file_paths.append(rule_path) elif os.path.isdir(rule_path): logger.debug("reading rules from directory %s", rule_path) - for root, dirs, files in os.walk(rule_path): + for root, _, files in os.walk(rule_path): if ".git" in root: # the .github directory contains CI config in capa-rules # this includes some .yml files @@ -622,7 +623,7 @@ def get_signatures(sigs_path): paths.append(sigs_path) elif os.path.isdir(sigs_path): logger.debug("reading signatures from directory %s", os.path.abspath(os.path.normpath(sigs_path))) - for root, dirs, files in os.walk(sigs_path): + for root, _, files in os.walk(sigs_path): for file in files: if file.endswith((".pat", ".pat.gz", ".sig")): sig_path = os.path.join(root, file) @@ -701,8 +702,8 @@ def compute_layout(rules, extractor, capabilities): otherwise, we may pollute the json document with a large amount of un-referenced data. 
""" - functions_by_bb = {} - bbs_by_function = {} + functions_by_bb: Dict[Address, Address] = {} + bbs_by_function: Dict[Address, List[Address]] = {} for f in extractor.get_functions(): bbs_by_function[f.address] = [] for bb in extractor.get_basic_blocks(f): @@ -713,7 +714,7 @@ def compute_layout(rules, extractor, capabilities): for rule_name, matches in capabilities.items(): rule = rules[rule_name] if rule.meta.get("scope") == capa.rules.BASIC_BLOCK_SCOPE: - for (addr, match) in matches: + for (addr, _) in matches: assert addr in functions_by_bb matched_bbs.add(addr) @@ -999,15 +1000,14 @@ def main(argv=None): return E_INVALID_FILE_TYPE try: - rules = get_rules(args.rules, disable_progress=args.quiet) - rules = capa.rules.RuleSet(rules) + rules = capa.rules.RuleSet(get_rules(args.rules, disable_progress=args.quiet)) logger.debug( "successfully loaded %s rules", # during the load of the RuleSet, we extract subscope statements into their own rules # that are subsequently `match`ed upon. this inflates the total rule count. # so, filter out the subscope rules when reporting total number of loaded rules. - len([i for i in filter(lambda r: not r.is_subscope_rule(), rules.rules.values())]), + len(list(filter(lambda r: not r.is_subscope_rule(), rules.rules.values()))), ) if args.tag: rules = rules.filter_rules_by_meta(args.tag) @@ -1018,12 +1018,12 @@ def main(argv=None): except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) logger.error( - "Please ensure you're using the rules that correspond to your major version of capa (%s)", - capa.version.get_major_version(), + "Make sure your file directory contains properly formatted capa rules. You can download the standard " + "collection of capa rules from https://github.com/mandiant/capa-rules/releases." 
) logger.error( - "You can check out these rules with the following command:\n %s", - capa.version.get_rules_checkout_command(), + "Please ensure you're using the rules that correspond to your major version of capa (%s)", + capa.version.get_major_version(), ) logger.error( "Or, for more details, see the rule set documentation here: %s", @@ -1150,8 +1150,7 @@ def ida_main(): rules_path = os.path.join(get_default_root(), "rules") logger.debug("rule path: %s", rules_path) - rules = get_rules(rules_path) - rules = capa.rules.RuleSet(rules) + rules = capa.rules.RuleSet(get_rules([rules_path])) meta = capa.ida.helpers.collect_metadata([rules_path]) diff --git a/capa/optimizer.py b/capa/optimizer.py index 0408bf07..997abd6c 100644 --- a/capa/optimizer.py +++ b/capa/optimizer.py @@ -47,7 +47,7 @@ def optimize_statement(statement): if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)): # has .children - statement.children = sorted(statement.children, key=lambda n: get_node_cost(n)) + statement.children = sorted(statement.children, key=get_node_cost) return elif isinstance(statement, (ceng.Not, ceng.Range)): # has .child diff --git a/capa/perf.py b/capa/perf.py index cb0e89ec..54575e99 100644 --- a/capa/perf.py +++ b/capa/perf.py @@ -1,8 +1,8 @@ +import typing import collections -from typing import Dict # this structure is unstable and may change before the next major release. 
-counters: Dict[str, int] = collections.Counter() +counters: typing.Counter[str] = collections.Counter() def reset(): diff --git a/capa/render/result_document.py b/capa/render/result_document.py index 6ae18ead..71bdb6bd 100644 --- a/capa/render/result_document.py +++ b/capa/render/result_document.py @@ -15,6 +15,7 @@ import capa.engine import capa.features.common import capa.features.freeze as frz import capa.features.address +import capa.features.freeze.features as frzf from capa.rules import RuleSet from capa.engine import MatchResults from capa.helpers import assert_never @@ -99,27 +100,27 @@ class Metadata(FrozenModel): rules=meta["analysis"]["rules"], base_address=frz.Address.from_capa(meta["analysis"]["base_address"]), layout=Layout( - functions=[ + functions=tuple( FunctionLayout( address=frz.Address.from_capa(address), - matched_basic_blocks=[ + matched_basic_blocks=tuple( BasicBlockLayout(address=frz.Address.from_capa(bb)) for bb in f["matched_basic_blocks"] - ], + ), ) for address, f in meta["analysis"]["layout"]["functions"].items() - ] + ) ), feature_counts=FeatureCounts( file=meta["analysis"]["feature_counts"]["file"], - functions=[ + functions=tuple( FunctionFeatureCount(address=frz.Address.from_capa(address), count=count) for address, count in meta["analysis"]["feature_counts"]["functions"].items() - ], + ), ), - library_functions=[ + library_functions=tuple( LibraryFunction(address=frz.Address.from_capa(address), name=name) for address, name in meta["analysis"]["library_functions"].items() - ], + ), ), ) @@ -137,18 +138,18 @@ class StatementModel(FrozenModel): class CompoundStatement(StatementModel): type: str - description: Optional[str] + description: Optional[str] = None class SomeStatement(StatementModel): type = "some" - description: Optional[str] + description: Optional[str] = None count: int class RangeStatement(StatementModel): type = "range" - description: Optional[str] + description: Optional[str] = None min: int max: int child: 
frz.Feature @@ -156,7 +157,7 @@ class RangeStatement(StatementModel): class SubscopeStatement(StatementModel): type = "subscope" - description: Optional[str] + description: Optional[str] = None scope: capa.rules.Scope @@ -277,7 +278,7 @@ class Match(BaseModel): # finally, splice that logic into this tree. if ( isinstance(node, FeatureNode) - and isinstance(node.feature, frz.features.MatchFeature) + and isinstance(node.feature, frzf.MatchFeature) # only add subtree on success, # because there won't be results for the other rule on failure. and success @@ -359,14 +360,14 @@ class Match(BaseModel): def parse_parts_id(s: str): - id = "" + id_ = "" parts = s.split("::") if len(parts) > 0: last = parts.pop() - last, _, id = last.rpartition(" ") - id = id.lstrip("[").rstrip("]") + last, _, id_ = last.rpartition(" ") + id_ = id_.lstrip("[").rstrip("]") parts.append(last) - return parts, id + return tuple(parts), id_ class AttackSpec(FrozenModel): @@ -392,7 +393,7 @@ class AttackSpec(FrozenModel): tactic = "" technique = "" subtechnique = "" - parts, id = parse_parts_id(s) + parts, id_ = parse_parts_id(s) if len(parts) > 0: tactic = parts[0] if len(parts) > 1: @@ -405,7 +406,7 @@ class AttackSpec(FrozenModel): tactic=tactic, technique=technique, subtechnique=subtechnique, - id=id, + id=id_, ) @@ -432,7 +433,7 @@ class MBCSpec(FrozenModel): objective = "" behavior = "" method = "" - parts, id = parse_parts_id(s) + parts, id_ = parse_parts_id(s) if len(parts) > 0: objective = parts[0] if len(parts) > 1: @@ -445,7 +446,7 @@ class MBCSpec(FrozenModel): objective=objective, behavior=behavior, method=method, - id=id, + id=id_, ) @@ -532,10 +533,10 @@ class ResultDocument(BaseModel): rule_matches[rule_name] = RuleMatches( meta=RuleMetadata.from_capa(rule), source=rule.definition, - matches=[ + matches=tuple( (frz.Address.from_capa(addr), Match.from_capa(rules, capabilities, match)) for addr, match in matches - ], + ), ) return ResultDocument(meta=Metadata.from_capa(meta), 
rules=rule_matches) diff --git a/capa/render/utils.py b/capa/render/utils.py index 97185a66..2cf480c9 100644 --- a/capa/render/utils.py +++ b/capa/render/utils.py @@ -24,14 +24,6 @@ def bold2(s: str) -> str: return termcolor.colored(s, "green") -def hex(n: int) -> str: - """render the given number using upper case hex, like: 0x123ABC""" - if n < 0: - return "-0x%X" % (-n) - else: - return "0x%X" % n - - def format_parts_id(data: Union[rd.AttackSpec, rd.MBCSpec]): """ format canonical representation of ATT&CK/MBC parts and ID diff --git a/capa/render/verbose.py b/capa/render/verbose.py index 6bdeefda..5a225460 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -23,13 +23,11 @@ Unless required by applicable law or agreed to in writing, software distributed See the License for the specific language governing permissions and limitations under the License. """ import tabulate -import dnfile.mdtable -import dncil.clr.token import capa.rules +import capa.helpers import capa.render.utils as rutils import capa.features.freeze as frz -import capa.render.result_document import capa.render.result_document as rd from capa.rules import RuleSet from capa.engine import MatchResults @@ -37,16 +35,16 @@ from capa.engine import MatchResults def format_address(address: frz.Address) -> str: if address.type == frz.AddressType.ABSOLUTE: - return rutils.hex(address.value) + return capa.helpers.hex(address.value) elif address.type == frz.AddressType.RELATIVE: - return f"base address+{rutils.hex(address.value)}" + return f"base address+{capa.helpers.hex(address.value)}" elif address.type == frz.AddressType.FILE: - return f"file+{rutils.hex(address.value)}" + return f"file+{capa.helpers.hex(address.value)}" elif address.type == frz.AddressType.DN_TOKEN: - return f"token({rutils.hex(address.value)})" + return f"token({capa.helpers.hex(address.value)})" elif address.type == frz.AddressType.DN_TOKEN_OFFSET: token, offset = address.value - return 
f"token({rutils.hex(token)})+{rutils.hex(offset)}" + return f"token({capa.helpers.hex(token)})+{capa.helpers.hex(offset)}" elif address.type == frz.AddressType.NO_ADDRESS: return "global" else: diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 6411da22..5950275a 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -6,11 +6,12 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Dict, List, Iterable +from typing import Dict, Iterable import tabulate import capa.rules +import capa.helpers import capa.render.utils as rutils import capa.render.verbose import capa.features.common @@ -128,7 +129,11 @@ def render_feature(ostream, match: rd.Match, feature: frzf.Feature, indent=0): ostream.write(" " * indent) key = feature.type - if isinstance(feature, frzf.ImportFeature): + if isinstance(feature, frzf.BasicBlockFeature): + # i don't think it makes sense to have standalone basic block features. 
+ # we don't parse them from rules, only things like: `count(basic block) > 1` + raise ValueError("cannot render basic block feature directly") + elif isinstance(feature, frzf.ImportFeature): # fixup access to Python reserved name value = feature.import_ elif isinstance(feature, frzf.ClassFeature): @@ -140,23 +145,28 @@ def render_feature(ostream, match: rd.Match, feature: frzf.Feature, indent=0): if value is None: raise ValueError("%s contains None" % key) - if key not in ("regex", "substring"): + if not isinstance(feature, (frzf.RegexFeature, frzf.SubstringFeature)): # like: # number: 10 = SOME_CONSTANT @ 0x401000 - if key == "string": + if isinstance(feature, frzf.StringFeature): value = render_string_value(value) - if key == "number": + elif isinstance( + feature, (frzf.NumberFeature, frzf.OffsetFeature, frzf.OperandNumberFeature, frzf.OperandOffsetFeature) + ): assert isinstance(value, int) - value = hex(value) + value = capa.helpers.hex(value) - ostream.write(key) + if isinstance(feature, frzf.PropertyFeature) and feature.access is not None: + key = f"property/{feature.access}" - if isinstance(feature, frzf.PropertyFeature): - if feature.access is not None: - ostream.write("/" + feature.access) + elif isinstance(feature, frzf.OperandNumberFeature): + key = f"operand[{feature.index}].number" - ostream.write(": ") + elif isinstance(feature, frzf.OperandOffsetFeature): + key = f"operand[{feature.index}].offset" + + ostream.write(f"{key}: ") if value: ostream.write(rutils.bold2(value)) @@ -165,7 +175,7 @@ def render_feature(ostream, match: rd.Match, feature: frzf.Feature, indent=0): ostream.write(capa.rules.DESCRIPTION_SEPARATOR) ostream.write(feature.description) - if key not in ("os", "arch"): + if not isinstance(feature, (frzf.OSFeature, frzf.ArchFeature, frzf.FormatFeature)): render_locations(ostream, match.locations) ostream.write("\n") else: diff --git a/capa/rules.py b/capa/rules.py index 5da1f312..c4d2ad77 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ 
-158,7 +158,7 @@ SUPPORTED_FEATURES[FUNCTION_SCOPE].update(SUPPORTED_FEATURES[BASIC_BLOCK_SCOPE]) class InvalidRule(ValueError): def __init__(self, msg): - super(InvalidRule, self).__init__() + super().__init__() self.msg = msg def __str__(self): @@ -170,7 +170,7 @@ class InvalidRule(ValueError): class InvalidRuleWithPath(InvalidRule): def __init__(self, path, msg): - super(InvalidRuleWithPath, self).__init__(msg) + super().__init__(msg) self.path = path self.msg = msg self.__cause__ = None @@ -181,7 +181,7 @@ class InvalidRuleWithPath(InvalidRule): class InvalidRuleSet(ValueError): def __init__(self, msg): - super(InvalidRuleSet, self).__init__() + super().__init__() self.msg = msg def __str__(self): @@ -233,23 +233,23 @@ def parse_range(s: str): min_spec = min_spec.strip() max_spec = max_spec.strip() - min = None + min_ = None if min_spec: - min = parse_int(min_spec) - if min < 0: + min_ = parse_int(min_spec) + if min_ < 0: raise InvalidRule("range min less than zero") - max = None + max_ = None if max_spec: - max = parse_int(max_spec) - if max < 0: + max_ = parse_int(max_spec) + if max_ < 0: raise InvalidRule("range max less than zero") - if min is not None and max is not None: - if max < min: + if min_ is not None and max_ is not None: + if max_ < min_: raise InvalidRule("range max less than min") - return min, max + return min_, max_ def parse_feature(key: str): @@ -539,14 +539,15 @@ def build_statements(d, scope: str): index = key[len("operand[") : -len("].number")] try: index = int(index) - except ValueError: - raise InvalidRule("operand index must be an integer") + except ValueError as e: + raise InvalidRule("operand index must be an integer") from e value, description = parse_description(d[key], key, d.get("description")) + assert isinstance(value, int) try: feature = capa.features.insn.OperandNumber(index, value, description=description) except ValueError as e: - raise InvalidRule(str(e)) + raise InvalidRule(str(e)) from e 
ensure_feature_valid_for_scope(scope, feature) return feature @@ -554,14 +555,15 @@ def build_statements(d, scope: str): index = key[len("operand[") : -len("].offset")] try: index = int(index) - except ValueError: - raise InvalidRule("operand index must be an integer") + except ValueError as e: + raise InvalidRule("operand index must be an integer") from e value, description = parse_description(d[key], key, d.get("description")) + assert isinstance(value, int) try: feature = capa.features.insn.OperandOffset(index, value, description=description) except ValueError as e: - raise InvalidRule(str(e)) + raise InvalidRule(str(e)) from e ensure_feature_valid_for_scope(scope, feature) return feature @@ -581,7 +583,7 @@ def build_statements(d, scope: str): try: feature = capa.features.insn.Property(value, access=access, description=description) except ValueError as e: - raise InvalidRule(str(e)) + raise InvalidRule(str(e)) from e ensure_feature_valid_for_scope(scope, feature) return feature @@ -591,7 +593,7 @@ def build_statements(d, scope: str): try: feature = Feature(value, description=description) except ValueError as e: - raise InvalidRule(str(e)) + raise InvalidRule(str(e)) from e ensure_feature_valid_for_scope(scope, feature) return feature @@ -606,7 +608,7 @@ def second(s: List[Any]) -> Any: class Rule: def __init__(self, name: str, scope: str, statement: Statement, meta, definition=""): - super(Rule, self).__init__() + super().__init__() self.name = name self.scope = scope self.statement = statement @@ -632,7 +634,7 @@ class Rule: Returns: List[str]: names of rules upon which this rule depends. """ - deps = set([]) + deps: Set[str] = set([]) def rec(statement): if isinstance(statement, capa.features.common.MatchedRule): @@ -649,6 +651,7 @@ class Rule: deps.update(map(lambda r: r.name, namespaces[statement.value])) else: # not a namespace, assume its a rule name. 
+ assert isinstance(statement.value, str) deps.add(statement.value) elif isinstance(statement, ceng.Statement): @@ -664,7 +667,11 @@ class Rule: def _extract_subscope_rules_rec(self, statement): if isinstance(statement, ceng.Statement): # for each child that is a subscope, - for subscope in filter(lambda statement: isinstance(statement, ceng.Subscope), statement.get_children()): + for child in statement.get_children(): + if not isinstance(child, ceng.Subscope): + continue + + subscope = child # create a new rule from it. # the name is a randomly generated, hopefully unique value. @@ -735,7 +742,7 @@ class Rule: return self.statement.evaluate(features, short_circuit=short_circuit) @classmethod - def from_dict(cls, d, definition): + def from_dict(cls, d, definition) -> "Rule": meta = d["rule"]["meta"] name = meta["name"] # if scope is not specified, default to function scope. @@ -769,14 +776,12 @@ class Rule: # prefer to use CLoader to be fast, see #306 # on Linux, make sure you install libyaml-dev or similar # on Windows, get WHLs from pyyaml.org/pypi - loader = yaml.CLoader logger.debug("using libyaml CLoader.") + return yaml.CLoader except: - loader = yaml.Loader logger.debug("unable to import libyaml CLoader, falling back to Python yaml parser.") logger.debug("this will be slower to load rules.") - - return loader + return yaml.Loader @staticmethod def _get_ruamel_yaml_parser(): @@ -788,8 +793,9 @@ class Rule: # use block mode, not inline json-like mode y.default_flow_style = False - # leave quotes unchanged - y.preserve_quotes = True + # leave quotes unchanged. + # manually verified this property exists, even if mypy complains. + y.preserve_quotes = True # type: ignore # indent lists by two spaces below their parent # @@ -800,12 +806,13 @@ class Rule: y.indent(sequence=2, offset=2) # avoid word wrapping - y.width = 4096 + # manually verified this property exists, even if mypy complains. 
+ y.width = 4096 # type: ignore return y @classmethod - def from_yaml(cls, s, use_ruamel=False): + def from_yaml(cls, s, use_ruamel=False) -> "Rule": if use_ruamel: # ruamel enables nice formatting and doc roundtripping with comments doc = cls._get_ruamel_yaml_parser().load(s) @@ -815,7 +822,7 @@ class Rule: return cls.from_dict(doc, s) @classmethod - def from_yaml_file(cls, path, use_ruamel=False): + def from_yaml_file(cls, path, use_ruamel=False) -> "Rule": with open(path, "rb") as f: try: rule = cls.from_yaml(f.read().decode("utf-8"), use_ruamel=use_ruamel) @@ -826,11 +833,11 @@ class Rule: _ = RuleMetadata.from_capa(rule) return rule except InvalidRule as e: - raise InvalidRuleWithPath(path, str(e)) + raise InvalidRuleWithPath(path, str(e)) from e except pydantic.ValidationError as e: - raise InvalidRuleWithPath(path, str(e)) + raise InvalidRuleWithPath(path, str(e)) from e - def to_yaml(self): + def to_yaml(self) -> str: # reformat the yaml document with a common style. # this includes: # - ordering the meta elements @@ -1067,7 +1074,7 @@ class RuleSet: """ def __init__(self, rules: List[Rule]): - super(RuleSet, self).__init__() + super().__init__() ensure_rules_are_unique(rules) @@ -1259,7 +1266,7 @@ class RuleSet: return (easy_rules_by_feature, hard_rules) @staticmethod - def _get_rules_for_scope(rules, scope): + def _get_rules_for_scope(rules, scope) -> List[Rule]: """ given a collection of rules, collect the rules that are needed at the given scope. these rules are ordered topologically. @@ -1267,7 +1274,7 @@ class RuleSet: don't include auto-generated "subscope" rules. we want to include general "lib" rules here - even if they are not dependencies of other rules, see #398 """ - scope_rules = set([]) + scope_rules: Set[Rule] = set([]) # we need to process all rules, not just rules with the given scope. # this is because rules with a higher scope, e.g. 
file scope, may have subscope rules @@ -1281,7 +1288,7 @@ class RuleSet: return get_rules_with_scope(topologically_order_rules(list(scope_rules)), scope) @staticmethod - def _extract_subscope_rules(rules): + def _extract_subscope_rules(rules) -> List[Rule]: """ process the given sequence of rules. for each one, extract any embedded subscope rules into their own rule. @@ -1317,13 +1324,13 @@ class RuleSet: for k, v in rule.meta.items(): if isinstance(v, str) and tag in v: logger.debug('using rule "%s" and dependencies, found tag in meta.%s: %s', rule.name, k, v) - rules_filtered.update(set(capa.rules.get_rules_and_dependencies(rules, rule.name))) + rules_filtered.update(set(get_rules_and_dependencies(rules, rule.name))) break if isinstance(v, list): for vv in v: if tag in vv: logger.debug('using rule "%s" and dependencies, found tag in meta.%s: %s', rule.name, k, vv) - rules_filtered.update(set(capa.rules.get_rules_and_dependencies(rules, rule.name))) + rules_filtered.update(set(get_rules_and_dependencies(rules, rule.name))) break return RuleSet(list(rules_filtered)) diff --git a/capa/version.py b/capa/version.py index 740d7f28..af021c70 100644 --- a/capa/version.py +++ b/capa/version.py @@ -3,11 +3,3 @@ __version__ = "4.0.1" def get_major_version(): return int(__version__.partition(".")[0]) - - -def get_rules_branch(): - return f"v{get_major_version()}" - - -def get_rules_checkout_command(): - return f"$ git clone https://github.com/mandiant/capa-rules.git -b {get_rules_branch()} /local/path/to/rules" diff --git a/doc/installation.md b/doc/installation.md index 0e455c10..04414062 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -6,13 +6,11 @@ If you simply want to use capa, use the standalone binaries we host on GitHub: h We use PyInstaller to create these packages. -The capa [README](../README.md#download) also links to nightly builds of standalone binaries from the latest development branch. 
- ### Linux Standalone installation The Linux Standalone binary has been built using GLIB 2.26. -Consequently it works when using GLIB >= 2.26. -This requirement is satisfied by default in most newer distribution such as Ubuntu >= 18, Debian >= 10, openSUSE >= 15.1 and CentOS >= 8. +Consequently, it works when using GLIB >= 2.26. +This requirement is satisfied by default in newer distributions such as Ubuntu >= 18, Debian >= 10, openSUSE >= 15.1 and CentOS >= 8. But the binary may not work in older distributions. ### MacOS Standalone installation @@ -24,24 +22,27 @@ By default, on MacOS Catalina or greater, Gatekeeper will block execution of the ## Method 2: Using capa as a Python library To install capa as a Python library use `pip` to fetch the `flare-capa` module. -#### *Note*: +### 1. Install capa module +Use `pip` to install the capa module to your local Python environment. This fetches the library code to your computer but does not keep editable source files around for you to hack on. If you'd like to edit the source files, see below. `$ pip install flare-capa` + +#### *Note on capa rules and library identification signatures* This method is appropriate for integrating capa in an existing project. -This technique doesn't pull the default rule set, so you should check it out separately from [capa-rules](https://github.com/mandiant/capa-rules/) and pass the directory to the entrypoint using `-r` or set the rules path in the IDA Pro plugin: +This technique doesn't pull the default rule set. You can obtain rule releases from [capa-rules](https://github.com/mandiant/capa-rules/releases) and pass the directory to the entrypoint using `-r`. In the IDA Pro plugin you need to configure the rules directory path once.
```console -$ git clone https://github.com/mandiant/capa-rules.git -b v3 /local/path/to/rules -$ capa -r /local/path/to/rules suspicious.exe +$ wget https://github.com/mandiant/capa-rules/archive/refs/tags/v4.0.0.zip +$ unzip v4.0.0.zip +$ capa -r /path/to/capa-rules suspicious.exe ``` This technique also doesn't set up the default library identification [signatures](https://github.com/mandiant/capa/tree/master/sigs). You can pass the signature directory using the `-s` argument. For example, to run capa with both a rule path and a signature path: +```console +$ capa -r /path/to/capa-rules -s /path/to/capa-sigs suspicious.exe +``` - capa -r /path/to/capa-rules -s /path/to/capa-sigs suspicious.exe Alternatively, see Method 3 below. -### 1. Install capa module -Use `pip` to install the capa module to your local Python environment. This fetches the library code to your computer but does not keep editable source files around for you to hack on. If you'd like to edit the source files, see below. `$ pip install flare-capa` - ### 2. Use capa You can now import the `capa` module from a Python script or use the IDA Pro plugins from the `capa/ida` directory. For more information please see the [usage](usage.md) documentation. @@ -49,18 +50,20 @@ You can now import the `capa` module from a Python script or use the IDA Pro plu If you'd like to review and modify the capa source code, you'll need to check it out from GitHub and install it locally. By following these instructions, you'll maintain a local directory of source code that you can modify and run easily. ### 1. Check out source code -Next, clone the capa git repository. +Clone the capa git repository. We use submodules to separate [code](https://github.com/mandiant/capa), [rules](https://github.com/mandiant/capa-rules), and [test data](https://github.com/mandiant/capa-testfiles). + To clone everything use the `--recurse-submodules` option: -- CAUTION: The capa testfiles repository contains many malware samples.
If you pull down everything using this method, you may want to install to a directory that won't trigger your anti-virus software. +- CAUTION: The capa testfiles repository contains many malware samples. If you pull down everything using this method, you may want to install to a directory that is ignored by your anti-virus software. - `$ git clone --recurse-submodules https://github.com/mandiant/capa.git /local/path/to/src` (HTTPS) - `$ git clone --recurse-submodules git@github.com:mandiant/capa.git /local/path/to/src` (SSH) -To only get the source code and our provided rules (common), follow these steps: +To only get the source code and our provided rules (a more common use-case), follow these steps: - clone repository - `$ git clone https://github.com/mandiant/capa.git /local/path/to/src` (HTTPS) - `$ git clone git@github.com:mandiant/capa.git /local/path/to/src` (SSH) - `$ cd /local/path/to/src` +- initialize the rules submodule and pull rules - `$ git submodule update --init rules` ### 2. Install the local source code @@ -76,8 +79,7 @@ You'll find that the `capa.exe` (Windows) or `capa` (Linux/MacOS) executables in For development, we recommend to use [venv](https://docs.python.org/3/tutorial/venv.html). It allows you to create a virtual environment: a self-contained directory tree that contains a Python installation for a particular version of Python, plus a number of additional packages. This approach avoids conflicts between the requirements of different applications on your computer. It also ensures that you don't overlook to add a new requirement to `setup.up` using a library already installed on your system. 
-To create an environment (in the parent directory, to avoid commiting it by accident or messing with the linters), run: -`$ python3 -m venv ../capa-env` +To create an environment (in the parent directory, to avoid committing it by accident or messing with the linters), run: `$ python3 -m venv ../capa-env` To activate `capa-env` in Linux or MacOS, run: `$ source ../capa-env/bin/activate` @@ -90,8 +92,8 @@ For more details about creating and using virtual environments, check out the [v ##### Install development dependencies We use the following tools to ensure consistent code style and formatting: - - [black](https://github.com/psf/black) code formatter, with `-l 120` - - [isort 5](https://pypi.org/project/isort/) code formatter, with `--profile black --length-sort --line-width 120` + - [black](https://github.com/psf/black) code formatter + - [isort 5](https://pypi.org/project/isort/) code formatter - [dos2unix](https://linux.die.net/man/1/dos2unix) for UNIX-style LF newlines - [capafmt](https://github.com/mandiant/capa/blob/master/scripts/capafmt.py) rule formatter @@ -104,7 +106,7 @@ You can run it with the argument `no_tests` to skip the tests and only run the c ##### Setup hooks [optional] -If you plan to contribute to capa, you may want to setup the hooks. +If you plan to contribute to capa, you may want to setup the provided hooks. Run `scripts/setup-hooks.sh` to set the following hooks up: - The `pre-commit` hook runs checks before every `git commit`. It runs `scripts/ci.sh no_tests` aborting the commit if there are code style or rule linter offenses you need to fix. @@ -112,13 +114,17 @@ Run `scripts/setup-hooks.sh` to set the following hooks up: It runs `scripts/ci.sh` aborting the push if there are code style or rule linter offenses or if the tests fail. This way you can ensure everything is alright before sending a pull request. -You can skip the checks by using the `--no-verify` git option.
+You can skip the checks by using the `-n`/`--no-verify` git option. ### 3. Compile binary using PyInstaller -We compile capa standalone binaries using PyInstaller. To reproduce the build process check out the source code as described above and follow these steps. +We compile capa standalone binaries using PyInstaller. To reproduce the build process check out the source code as described above and follow the steps below. #### Install PyInstaller: -`$ pip install pyinstaller` (Python 3) +`$ pip install pyinstaller` + +Or install capa with build dependencies: + +`$ pip install -e /local/path/to/src[build]` #### Run Pyinstaller `$ pyinstaller .github/pyinstaller/pyinstaller.spec` diff --git a/doc/release.md b/doc/release.md index cd26fd84..ec950e88 100644 --- a/doc/release.md +++ b/doc/release.md @@ -3,7 +3,7 @@ - [ ] Ensure all [milestoned issues/PRs](https://github.com/mandiant/capa/milestones) are addressed, or reassign to a new milestone. - [ ] Add the `dont merge` label to all PRs that are close to be ready to merge (or merge them if they are ready) in [capa](https://github.com/mandiant/capa/pulls) and [capa-rules](https://github.com/mandiant/capa-rules/pulls). - [ ] Ensure the [CI workflow succeeds in master](https://github.com/mandiant/capa/actions/workflows/tests.yml?query=branch%3Amaster). -- [ ] Ensure that `python scripts/lint.py rules/ --thorough` succeeds (only `missing examples` offenses are allowed in the nursery). +- [ ] Ensure that `python scripts/lint.py rules/ --thorough` succeeds (only `missing examples` offenses are allowed in the nursery).
- [ ] Review changes - capa https://github.com/mandiant/capa/compare/\...master - capa-rules https://github.com/mandiant/capa-rules/compare/\\...master @@ -37,13 +37,10 @@ - [ ] Update [capa/version.py](https://github.com/mandiant/capa/blob/master/capa/version.py) - [ ] Create a PR with the updated [CHANGELOG.md](https://github.com/mandiant/capa/blob/master/CHANGELOG.md) and [capa/version.py](https://github.com/mandiant/capa/blob/master/capa/version.py). Copy this checklist in the PR description. - [ ] After PR review, merge the PR and [create the release in GH](https://github.com/mandiant/capa/releases/new) using text from the [CHANGELOG.md](https://github.com/mandiant/capa/blob/master/CHANGELOG.md). -- [ ] Verify GH actions [upload artifacts](https://github.com/mandiant/capa/releases), [publish to PyPI](https://pypi.org/project/flare-capa) and [create a tag in capa rules](https://github.com/mandiant/capa-rules/tags) upon completion. -- [ ] Manually update capa rules major version rule branch - ```commandline - [capa/rules] $ git pull master - [capa/rules] $ git checkout v3 # create if new major version: git checkout -b vX - [capa/rules] $ git merge master - [capa/rules] $ git push origin v3 - ``` +- Verify GH actions + - [ ] [upload artifacts](https://github.com/mandiant/capa/releases) + - [ ] [publish to PyPI](https://pypi.org/project/flare-capa) + - [ ] [create tag in capa rules](https://github.com/mandiant/capa-rules/tags) + - [ ] [create release in capa rules](https://github.com/mandiant/capa-rules/releases) - [ ] [Spread the word](https://twitter.com) - [ ] Update internal service diff --git a/doc/rules.md b/doc/rules.md index be68e00b..170379e0 100644 --- a/doc/rules.md +++ b/doc/rules.md @@ -1,6 +1,5 @@ ### rules - capa uses a collection of rules to identify capabilities within a program. The [github.com/mandiant/capa-rules](https://github.com/mandiant/capa-rules) repository contains hundreds of standard library rules that are distributed with capa. 
@@ -12,8 +11,8 @@ $ capa suspicious.exe However, you may want to modify the rules for a variety of reasons: - - develop new rules to find behaviors, and/or - - tweak existing rules to reduce false positives, and/or + - develop new rules to find behaviors, + - tweak existing rules to reduce false positives, - collect a private selection of rules not shared publicly. Or, you may want to use capa as a Python library within another application. @@ -21,22 +20,18 @@ Or, you may want to use capa as a Python library within another application. In these scenarios, you must provide the rule set to capa as a directory on your file system. Do this using the `-r`/`--rules` parameter: ```console -$ capa --rules /local/path/to/rules suspicious.exe +$ capa --rules /local/path/to/rules suspicious.exe ``` -You can collect the standard set of rules in two ways: +You can download the standard set of rules as ZIP or TGZ archives from the [capa-rules release page](https://github.com/mandiant/capa-rules/releases). - - [download from the Github releases page](#download-release-archive), or - - [clone from Github](#clone-with-git). - -Note that you must use match the rules major version with the capa major version, -i.e., use `v1` rules with `v1` of capa. +Note that you must match the rules major version with the capa major version, i.e., use `v1` rules with `v1` of capa. This is so that new versions of capa can update rule syntax, such as by adding new fields and logic. Otherwise, using rules with a mismatched version of capa may lead to errors like: ``` -$ capa --rules /path/to/mismatched/rules suspicious.exe +$ capa --rules /path/to/mismatched/rules suspicious.exe ERROR:lint:invalid rule: injection.yml: invalid rule: unexpected statement: instruction ``` @@ -46,27 +41,3 @@ You can check the version of capa you're currently using like this: $ capa --version capa 3.0.3 ``` - -#### download release archive - -The releases page is [here](https://github.com/mandiant/capa-rules/tags/).
-Find the most recent release corresponding to your major version of capa and download the ZIP archive. -Here are some quick links: - - v1: [v1](https://github.com/mandiant/capa-rules/releases/tag/v1) - - v2: [v2](https://github.com/mandiant/capa-rules/releases/tag/v2) - - v3: [v3](https://github.com/mandiant/capa-rules/releases/tag/v3) - -#### clone with git - -To fetch with git, clone the appropriate branch like this: - -```console -$ git clone https://github.com/mandiant/capa-rules.git -b v3 /local/path/to/rules -``` - -Note that the branch name (`v3` in the example above) must match the major version of capa you're using. - - - [v1](https://github.com/mandiant/capa-rules/tree/v1): `v1` - - [v2](https://github.com/mandiant/capa-rules/tree/v2): `v2` - - [v3](https://github.com/mandiant/capa-rules/tree/v3): `v3` - diff --git a/rules b/rules index 793837a4..519b87e4 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit 793837a406e27dd2754202fd13d7dc6ba8397679 +Subproject commit 519b87e44639b463b7b95518e1a998fac8701ec4 diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index 8ec23903..b57928c6 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -152,8 +152,7 @@ def main(argv=None): capa.main.handle_common_args(args) try: - rules = capa.main.get_rules(args.rules) - rules = capa.rules.RuleSet(rules) + rules = capa.rules.RuleSet(capa.main.get_rules(args.rules)) logger.info("successfully loaded %s rules", len(rules)) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 06a1d031..9474347b 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -64,7 +64,6 @@ unsupported = ["characteristic", "mnemonic", "offset", "subscope", "Range"] # collect all converted rules to be able to check if we have needed sub rules for match: converted_rules = [] -count_incomplete = 0 default_tags = "CAPA " @@ -537,7 +536,8 @@ def 
output_unsupported_capa_rules(yaml, capa_rulename, url, reason): unsupported_capa_rules_names.write(url.encode("utf-8") + b"\n") -def convert_rules(rules, namespaces, cround): +def convert_rules(rules, namespaces, cround, make_priv): + count_incomplete = 0 for rule in rules.rules.values(): rule_name = convert_rule_name(rule.name) @@ -652,7 +652,6 @@ def convert_rules(rules, namespaces, cround): if meta_name and meta_value: yara_meta += "\t" + meta_name + ' = "' + meta_value + '"\n' - rule_name_bonus = "" if rule_comment: yara_meta += '\tcomment = "' + rule_comment + '"\n' yara_meta += '\tdate = "' + today + '"\n' @@ -679,12 +678,13 @@ def convert_rules(rules, namespaces, cround): # TODO: now the rule is finished and could be automatically checked with the capa-testfile(s) named in meta (doing it for all of them using yara-ci upload at the moment) output_yar(yara) converted_rules.append(rule_name) - global count_incomplete count_incomplete += incomplete else: output_unsupported_capa_rules(rule.to_yaml(), rule.name, url, yara_condition) pass + return count_incomplete + def main(argv=None): if argv is None: @@ -696,7 +696,6 @@ def main(argv=None): capa.main.install_common_args(parser, wanted={"tag"}) args = parser.parse_args(args=argv) - global make_priv make_priv = args.private if args.verbose: @@ -710,9 +709,9 @@ def main(argv=None): logging.getLogger("capa2yara").setLevel(level) try: - rules = capa.main.get_rules([args.rules], disable_progress=True) - namespaces = capa.rules.index_rules_by_namespace(list(rules)) - rules = capa.rules.RuleSet(rules) + rules_ = capa.main.get_rules([args.rules], disable_progress=True) + namespaces = capa.rules.index_rules_by_namespace(rules_) + rules = capa.rules.RuleSet(rules_) logger.info("successfully loaded %s rules (including subscope rules which will be ignored)", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) @@ -745,14 +744,15 @@ def main(argv=None): # do several rounds of converting rules because some 
rules for match: might not be converted in the 1st run num_rules = 9999999 cround = 0 + count_incomplete = 0 while num_rules != len(converted_rules) or cround < min_rounds: cround += 1 logger.info("doing convert_rules(), round: " + str(cround)) num_rules = len(converted_rules) - convert_rules(rules, namespaces, cround) + count_incomplete += convert_rules(rules, namespaces, cround, make_priv) # one last round to collect all unconverted rules - convert_rules(rules, namespaces, 9000) + count_incomplete += convert_rules(rules, namespaces, 9000, make_priv) stats = "\n// converted rules : " + str(len(converted_rules)) stats += "\n// among those are incomplete : " + str(count_incomplete) diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index 682d4dc6..2db6a644 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -172,7 +172,7 @@ def capa_details(rules_path, file_path, output_format="dictionary"): meta["analysis"].update(counts) meta["analysis"]["layout"] = capa.main.compute_layout(rules, extractor, capabilities) - capa_output = False + capa_output: Any = False if output_format == "dictionary": # ...as python dictionary, simplified as textable but in dictionary doc = rd.ResultDocument.from_capa(meta, rules, capabilities) diff --git a/scripts/detect-elf-os.py b/scripts/detect-elf-os.py index 63186ed8..078b80dd 100644 --- a/scripts/detect-elf-os.py +++ b/scripts/detect-elf-os.py @@ -28,7 +28,7 @@ def main(argv=None): if capa.helpers.is_runtime_ida(): from capa.ida.helpers import IDAIO - f: BinaryIO = IDAIO() + f: BinaryIO = IDAIO() # type: ignore else: if argv is None: diff --git a/scripts/lint.py b/scripts/lint.py index 467ad3cf..cd6e32cb 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -248,7 +248,7 @@ class InvalidAttckOrMbcTechnique(Lint): """ def __init__(self): - super(InvalidAttckOrMbcTechnique, self).__init__() + super().__init__() try: with open(f"{os.path.dirname(__file__)}/linter-data.json", "rb") as fd: @@ -902,11 
+902,15 @@ def redirecting_print_to_tqdm(): old_print(*args, **kwargs) try: - # Globaly replace print with new_print - inspect.builtins.print = new_print + # Globaly replace print with new_print. + # Verified this works manually on Python 3.11: + # >>> import inspect + # >>> inspect.builtins + # + inspect.builtins.print = new_print # type: ignore yield finally: - inspect.builtins.print = old_print + inspect.builtins.print = old_print # type: ignore def lint(ctx: Context): @@ -998,10 +1002,8 @@ def main(argv=None): time0 = time.time() try: - rules = capa.main.get_rules(args.rules, disable_progress=True) - rule_count = len(rules) - rules = capa.rules.RuleSet(rules) - logger.info("successfully loaded %s rules", rule_count) + rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True)) + logger.info("successfully loaded %s rules", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) logger.debug("selected %s rules", len(rules)) diff --git a/scripts/setup-linter-dependencies.py b/scripts/setup-linter-dependencies.py index 326a684c..021c0e31 100644 --- a/scripts/setup-linter-dependencies.py +++ b/scripts/setup-linter-dependencies.py @@ -151,7 +151,7 @@ class MbcExtractor(MitreExtractor): def _get_tactics(self) -> List[Dict]: """Override _get_tactics to edit the tactic name for Micro-objective""" - tactics = super(MbcExtractor, self)._get_tactics() + tactics = super()._get_tactics() # We don't want the Micro-objective string inside objective names for tactic in tactics: tactic["name"] = tactic["name"].replace(" Micro-objective", "") diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 0c5ff361..d1773021 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -141,8 +141,7 @@ def main(argv=None): return -1 try: - rules = capa.main.get_rules(args.rules) - rules = capa.rules.RuleSet(rules) + rules = 
capa.rules.RuleSet(capa.main.get_rules(args.rules)) logger.info("successfully loaded %s rules", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) diff --git a/scripts/show-features.py b/scripts/show-features.py index f07dcf75..d23a9a0a 100644 --- a/scripts/show-features.py +++ b/scripts/show-features.py @@ -79,6 +79,7 @@ import capa.exceptions import capa.render.verbose as v import capa.features.common import capa.features.freeze +import capa.features.address import capa.features.extractors.base_extractor from capa.helpers import log_unsupported_runtime_error @@ -108,7 +109,7 @@ def main(argv=None): try: sig_paths = capa.main.get_signatures(args.signatures) - except (IOError) as e: + except IOError as e: logger.error("%s", str(e)) return -1 @@ -135,7 +136,7 @@ def main(argv=None): for feature, addr in extractor.extract_file_features(): print("file: %s: %s" % (format_address(addr), feature)) - function_handles = extractor.get_functions() + function_handles = tuple(extractor.get_functions()) if args.function: if args.format == "freeze": @@ -172,7 +173,7 @@ def ida_main(): print("file: %s: %s" % (format_address(addr), feature)) return - function_handles = extractor.get_functions() + function_handles = tuple(extractor.get_functions()) if function: function_handles = tuple(filter(lambda fh: fh.inner.start_ea == function, function_handles)) diff --git a/setup.cfg b/setup.cfg index 5e0292f4..87eef850 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,3 +19,10 @@ test = pytest ignore = E203, E302, E402, E501, E712, E722, E731, W291, W503 max-line-length = 180 statistics = True + + +[pylint.FORMAT] +max-line-length = 180 + +[pylint] +disable = missing-docstring,invalid-name,import-outside-toplevel,redefined-outer-name,consider-using-f-string \ No newline at end of file diff --git a/setup.py b/setup.py index 09f21119..7d3eefb9 100644 --- a/setup.py +++ b/setup.py @@ -18,15 +18,15 @@ requirements = [ "termcolor==2.1.1", "wcwidth==0.2.5", "ida-settings==2.1.0", 
- "viv-utils[flirt]==0.7.5", + "viv-utils[flirt]==0.7.7", "halo==0.0.31", "networkx==2.5.1", "ruamel.yaml==0.17.21", "vivisect==1.0.8", "pefile==2022.5.30", "pyelftools==0.29", - "dnfile==0.12.0", - "dncil==1.0.1", + "dnfile==0.13.0", + "dncil==1.0.2", "pydantic==1.10.2", ] @@ -72,10 +72,10 @@ setuptools.setup( "pytest-sugar==0.9.4", "pytest-instafail==0.4.2", "pytest-cov==4.0.0", - "pycodestyle==2.9.1", - "black==22.10.0", - "isort==5.10.1", - "mypy==0.982", + "pycodestyle==2.10.0", + "black==22.12.0", + "isort==5.11.4", + "mypy==0.991", "psutil==5.9.2", "stix2==3.0.1", "requests==2.28.0", @@ -89,7 +89,7 @@ setuptools.setup( "types_requests==2.28.1", ], "build": [ - "pyinstaller==5.5", + "pyinstaller==5.7.0", ], }, zip_safe=False, diff --git a/tests/data b/tests/data index 23f114a2..4784dee3 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 23f114a2e2996d33e0bc9caf98f58b0c5bb0ade1 +Subproject commit 4784dee36e23a68c98bab09ec5b21cc7d16e84ff diff --git a/tests/fixtures.py b/tests/fixtures.py index 921c2865..6dbc9817 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -263,12 +263,20 @@ def get_data_path_by_name(name): return os.path.join(DNFILE_TESTFILES, "hello-world", "hello-world.exe") elif name.startswith("_1c444"): return os.path.join(CD, "data", "dotnet", "1c444ebeba24dcba8628b7dfe5fec7c6.exe_") + elif name.startswith("_387f15"): + return os.path.join( + CD, "data", "dotnet", "387f15043f0198fd3a637b0758c2b6dde9ead795c3ed70803426fc355731b173.dll_" + ) elif name.startswith("_692f"): return os.path.join(CD, "data", "dotnet", "692f7fd6d198e804d6af98eb9e390d61.exe_") elif name.startswith("_0953c"): return os.path.join(CD, "data", "0953cc3b77ed2974b09e3a00708f88de931d681e2d0cb64afbaf714610beabe6.exe_") elif name.startswith("_039a6"): return os.path.join(CD, "data", "039a6336d0802a2255669e6867a5679c7eb83313dbc61fb1c7232147379bd304.exe_") + elif name.startswith("b5f052"): + return os.path.join(CD, "data", 
"b5f0524e69b3a3cf636c7ac366ca57bf5e3a8fdc8a9f01caf196c611a7918a87.elf_") + elif name.startswith("bf7a9c"): + return os.path.join(CD, "data", "bf7a9c8bdfa6d47e01ad2b056264acc3fd90cf43fe0ed8deec93ab46b47d76cb.elf_") else: raise ValueError("unexpected sample fixture: %s" % name) @@ -730,8 +738,23 @@ FEATURE_PRESENCE_TESTS_DOTNET = sorted( ("_1c444", "function=0x1F68", capa.features.insn.API("GetWindowDC"), True), ("_1c444", "function=0x1F68", capa.features.insn.API("user32.GetWindowDC"), True), ("_1c444", "function=0x1F68", capa.features.insn.Number(0xCC0020), True), + ("_1c444", "token=0x600001D", capa.features.common.Characteristic("calls to"), True), + ("_1c444", "token=0x6000018", capa.features.common.Characteristic("calls to"), False), + ("_1c444", "token=0x600001D", capa.features.common.Characteristic("calls from"), True), + ("_1c444", "token=0x600000F", capa.features.common.Characteristic("calls from"), False), ("_1c444", "function=0x1F68", capa.features.insn.Number(0x0), True), ("_1c444", "function=0x1F68", capa.features.insn.Number(0x1), False), + ("_692f", "token=0x6000004", capa.features.insn.API("System.Linq.Enumerable::First"), True), # generic method + ( + "_692f", + "token=0x6000004", + capa.features.insn.Property("System.Linq.Enumerable::First"), + False, + ), # generic method + ("_692f", "token=0x6000004", capa.features.common.Namespace("System.Linq"), True), # generic method + ("_692f", "token=0x6000004", capa.features.common.Class("System.Linq.Enumerable"), True), # generic method + ("_1c444", "token=0x6000020", capa.features.common.Namespace("Reqss"), True), # ldftn + ("_1c444", "token=0x6000020", capa.features.common.Class("Reqss.Reqss"), True), # ldftn ( "_1c444", "function=0x1F59, bb=0x1F59, insn=0x1F5B", @@ -753,25 +776,25 @@ FEATURE_PRESENCE_TESTS_DOTNET = sorted( "token=0x600002B", capa.features.insn.Property("System.IO.FileInfo::Length", access=FeatureAccess.READ), True, - ), # MemberRef method + ), # MemberRef property access ( "_1c444", 
"token=0x600002B", capa.features.insn.Property("System.IO.FileInfo::Length"), True, - ), # MemberRef method + ), # MemberRef property access ( "_1c444", "token=0x6000081", capa.features.insn.API("System.Diagnostics.Process::Start"), True, - ), # MemberRef method + ), # MemberRef property access ( "_1c444", "token=0x6000081", capa.features.insn.Property( "System.Diagnostics.ProcessStartInfo::UseShellExecute", access=FeatureAccess.WRITE - ), # MemberRef method + ), # MemberRef property access True, ), ( @@ -779,7 +802,7 @@ FEATURE_PRESENCE_TESTS_DOTNET = sorted( "token=0x6000081", capa.features.insn.Property( "System.Diagnostics.ProcessStartInfo::WorkingDirectory", access=FeatureAccess.WRITE - ), # MemberRef method + ), # MemberRef property access True, ), ( @@ -787,41 +810,96 @@ FEATURE_PRESENCE_TESTS_DOTNET = sorted( "token=0x6000081", capa.features.insn.Property( "System.Diagnostics.ProcessStartInfo::FileName", access=FeatureAccess.WRITE - ), # MemberRef method + ), # MemberRef property access True, ), ( "_1c444", "token=0x6000087", - capa.features.insn.Property("Sockets.MySocket::reConnectionDelay", access=FeatureAccess.WRITE), # Field + capa.features.insn.Property( + "Sockets.MySocket::reConnectionDelay", access=FeatureAccess.WRITE + ), # Field property access True, ), ( "_1c444", "token=0x600008A", - capa.features.insn.Property("Sockets.MySocket::isConnected", access=FeatureAccess.WRITE), # Field + capa.features.insn.Property( + "Sockets.MySocket::isConnected", access=FeatureAccess.WRITE + ), # Field property access True, ), ( "_1c444", "token=0x600008A", - capa.features.insn.Property("Sockets.MySocket::onConnected", access=FeatureAccess.READ), # Field + capa.features.common.Class("Sockets.MySocket"), # Field property access + True, + ), + ( + "_1c444", + "token=0x600008A", + capa.features.common.Namespace("Sockets"), # Field property access + True, + ), + ( + "_1c444", + "token=0x600008A", + capa.features.insn.Property( + "Sockets.MySocket::onConnected", 
access=FeatureAccess.READ + ), # Field property access True, ), ( "_0953c", "token=0x6000004", - capa.features.insn.Property("System.Diagnostics.Debugger::IsAttached", access=FeatureAccess.READ), + capa.features.insn.Property( + "System.Diagnostics.Debugger::IsAttached", access=FeatureAccess.READ + ), # MemberRef property access True, - ), # MemberRef method + ), + ( + "_0953c", + "token=0x6000004", + capa.features.common.Class("System.Diagnostics.Debugger"), # MemberRef property access + True, + ), + ( + "_0953c", + "token=0x6000004", + capa.features.common.Namespace("System.Diagnostics"), # MemberRef property access + True, + ), ( "_692f", "token=0x6000006", capa.features.insn.Property( "System.Management.Automation.PowerShell::Streams", access=FeatureAccess.READ - ), # MemberRef method + ), # MemberRef property access False, ), + ( + "_387f15", + "token=0x600009E", + capa.features.insn.Property( + "Modulo.IqQzcRDvSTulAhyLtZHqyeYGgaXGbuLwhxUKXYmhtnOmgpnPJDTSIPhYPpnE::geoplugin_countryCode", + access=FeatureAccess.READ, + ), # MethodDef property access + True, + ), + ( + "_387f15", + "token=0x600009E", + capa.features.common.Class( + "Modulo.IqQzcRDvSTulAhyLtZHqyeYGgaXGbuLwhxUKXYmhtnOmgpnPJDTSIPhYPpnE" + ), # MethodDef property access + True, + ), + ( + "_387f15", + "token=0x600009E", + capa.features.common.Namespace("Modulo"), # MethodDef property access + True, + ), ( "_039a6", "token=0x6000007", @@ -869,7 +947,10 @@ FEATURE_COUNT_TESTS = [ ] -FEATURE_COUNT_TESTS_DOTNET = [] # type: ignore +FEATURE_COUNT_TESTS_DOTNET = [ + ("_1c444", "token=0x600001D", capa.features.common.Characteristic("calls to"), 1), + ("_1c444", "token=0x600001D", capa.features.common.Characteristic("calls from"), 9), +] def do_test_feature_presence(get_extractor, sample, scope, feature, expected): diff --git a/tests/test_engine.py b/tests/test_engine.py index 26bb59ce..89c3b739 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -5,61 +5,72 @@ # Unless required by applicable 
law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import capa.features.address from capa.engine import * from capa.features import * from capa.features.insn import * +ADDR1 = capa.features.address.AbsoluteVirtualAddress(0x401001) +ADDR2 = capa.features.address.AbsoluteVirtualAddress(0x401002) +ADDR3 = capa.features.address.AbsoluteVirtualAddress(0x401003) +ADDR4 = capa.features.address.AbsoluteVirtualAddress(0x401004) + def test_number(): - assert Number(1).evaluate({Number(0): {1}}) == False - assert Number(1).evaluate({Number(1): {1}}) == True - assert Number(1).evaluate({Number(2): {1, 2}}) == False + assert Number(1).evaluate({Number(0): {ADDR1}}) == False + assert Number(1).evaluate({Number(1): {ADDR1}}) == True + assert Number(1).evaluate({Number(2): {ADDR1, ADDR2}}) == False def test_and(): - assert And([Number(1)]).evaluate({Number(0): {1}}) == False - assert And([Number(1)]).evaluate({Number(1): {1}}) == True - assert And([Number(1), Number(2)]).evaluate({Number(0): {1}}) == False - assert And([Number(1), Number(2)]).evaluate({Number(1): {1}}) == False - assert And([Number(1), Number(2)]).evaluate({Number(2): {1}}) == False - assert And([Number(1), Number(2)]).evaluate({Number(1): {1}, Number(2): {2}}) == True + assert And([Number(1)]).evaluate({Number(0): {ADDR1}}) == False + assert And([Number(1)]).evaluate({Number(1): {ADDR1}}) == True + assert And([Number(1), Number(2)]).evaluate({Number(0): {ADDR1}}) == False + assert And([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}) == False + assert And([Number(1), Number(2)]).evaluate({Number(2): {ADDR1}}) == False + assert And([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}, Number(2): {ADDR2}}) == True def test_or(): - assert Or([Number(1)]).evaluate({Number(0): {1}}) == 
False - assert Or([Number(1)]).evaluate({Number(1): {1}}) == True - assert Or([Number(1), Number(2)]).evaluate({Number(0): {1}}) == False - assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True - assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}) == True - assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}, Number(2): {2}}) == True + assert Or([Number(1)]).evaluate({Number(0): {ADDR1}}) == False + assert Or([Number(1)]).evaluate({Number(1): {ADDR1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(0): {ADDR1}}) == False + assert Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(2): {ADDR1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}, Number(2): {ADDR2}}) == True def test_not(): - assert Not(Number(1)).evaluate({Number(0): {1}}) == True - assert Not(Number(1)).evaluate({Number(1): {1}}) == False + assert Not(Number(1)).evaluate({Number(0): {ADDR1}}) == True + assert Not(Number(1)).evaluate({Number(1): {ADDR1}}) == False def test_some(): - assert Some(0, [Number(1)]).evaluate({Number(0): {1}}) == True - assert Some(1, [Number(1)]).evaluate({Number(0): {1}}) == False + assert Some(0, [Number(1)]).evaluate({Number(0): {ADDR1}}) == True + assert Some(1, [Number(1)]).evaluate({Number(0): {ADDR1}}) == False - assert Some(2, [Number(1), Number(2), Number(3)]).evaluate({Number(0): {1}}) == False - assert Some(2, [Number(1), Number(2), Number(3)]).evaluate({Number(0): {1}, Number(1): {1}}) == False - assert Some(2, [Number(1), Number(2), Number(3)]).evaluate({Number(0): {1}, Number(1): {1}, Number(2): {1}}) == True + assert Some(2, [Number(1), Number(2), Number(3)]).evaluate({Number(0): {ADDR1}}) == False + assert Some(2, [Number(1), Number(2), Number(3)]).evaluate({Number(0): {ADDR1}, Number(1): {ADDR1}}) == False assert ( Some(2, [Number(1), Number(2), Number(3)]).evaluate( - {Number(0): {1}, Number(1): {1}, Number(2): {1}, 
Number(3): {1}} + {Number(0): {ADDR1}, Number(1): {ADDR1}, Number(2): {ADDR1}} + ) + == True + ) + assert ( + Some(2, [Number(1), Number(2), Number(3)]).evaluate( + {Number(0): {ADDR1}, Number(1): {ADDR1}, Number(2): {ADDR1}, Number(3): {ADDR1}} ) == True ) assert ( Some(2, [Number(1), Number(2), Number(3)]).evaluate( { - Number(0): {1}, - Number(1): {1}, - Number(2): {1}, - Number(3): {1}, - Number(4): {1}, + Number(0): {ADDR1}, + Number(1): {ADDR1}, + Number(2): {ADDR1}, + Number(3): {ADDR1}, + Number(4): {ADDR1}, } ) == True @@ -69,10 +80,10 @@ def test_some(): def test_complex(): assert True == Or( [And([Number(1), Number(2)]), Or([Number(3), Some(2, [Number(4), Number(5), Number(6)])])] - ).evaluate({Number(5): {1}, Number(6): {1}, Number(7): {1}, Number(8): {1}}) + ).evaluate({Number(5): {ADDR1}, Number(6): {ADDR1}, Number(7): {ADDR1}, Number(8): {ADDR1}}) assert False == Or([And([Number(1), Number(2)]), Or([Number(3), Some(2, [Number(4), Number(5)])])]).evaluate( - {Number(5): {1}, Number(6): {1}, Number(7): {1}, Number(8): {1}} + {Number(5): {ADDR1}, Number(6): {ADDR1}, Number(7): {ADDR1}, Number(8): {ADDR1}} ) @@ -83,54 +94,54 @@ def test_range(): # unbounded range with matching feature should always match assert Range(Number(1)).evaluate({Number(1): {}}) == True - assert Range(Number(1)).evaluate({Number(1): {0}}) == True + assert Range(Number(1)).evaluate({Number(1): {ADDR1}}) == True # unbounded max - assert Range(Number(1), min=1).evaluate({Number(1): {0}}) == True - assert Range(Number(1), min=2).evaluate({Number(1): {0}}) == False - assert Range(Number(1), min=2).evaluate({Number(1): {0, 1}}) == True + assert Range(Number(1), min=1).evaluate({Number(1): {ADDR1}}) == True + assert Range(Number(1), min=2).evaluate({Number(1): {ADDR1}}) == False + assert Range(Number(1), min=2).evaluate({Number(1): {ADDR1, ADDR2}}) == True # unbounded min - assert Range(Number(1), max=0).evaluate({Number(1): {0}}) == False - assert Range(Number(1), 
max=1).evaluate({Number(1): {0}}) == True - assert Range(Number(1), max=2).evaluate({Number(1): {0}}) == True - assert Range(Number(1), max=2).evaluate({Number(1): {0, 1}}) == True - assert Range(Number(1), max=2).evaluate({Number(1): {0, 1, 3}}) == False + assert Range(Number(1), max=0).evaluate({Number(1): {ADDR1}}) == False + assert Range(Number(1), max=1).evaluate({Number(1): {ADDR1}}) == True + assert Range(Number(1), max=2).evaluate({Number(1): {ADDR1}}) == True + assert Range(Number(1), max=2).evaluate({Number(1): {ADDR1, ADDR2}}) == True + assert Range(Number(1), max=2).evaluate({Number(1): {ADDR1, ADDR2, ADDR3}}) == False # we can do an exact match by setting min==max assert Range(Number(1), min=1, max=1).evaluate({Number(1): {}}) == False - assert Range(Number(1), min=1, max=1).evaluate({Number(1): {1}}) == True - assert Range(Number(1), min=1, max=1).evaluate({Number(1): {1, 2}}) == False + assert Range(Number(1), min=1, max=1).evaluate({Number(1): {ADDR1}}) == True + assert Range(Number(1), min=1, max=1).evaluate({Number(1): {ADDR1, ADDR2}}) == False # bounded range assert Range(Number(1), min=1, max=3).evaluate({Number(1): {}}) == False - assert Range(Number(1), min=1, max=3).evaluate({Number(1): {1}}) == True - assert Range(Number(1), min=1, max=3).evaluate({Number(1): {1, 2}}) == True - assert Range(Number(1), min=1, max=3).evaluate({Number(1): {1, 2, 3}}) == True - assert Range(Number(1), min=1, max=3).evaluate({Number(1): {1, 2, 3, 4}}) == False + assert Range(Number(1), min=1, max=3).evaluate({Number(1): {ADDR1}}) == True + assert Range(Number(1), min=1, max=3).evaluate({Number(1): {ADDR1, ADDR2}}) == True + assert Range(Number(1), min=1, max=3).evaluate({Number(1): {ADDR1, ADDR2, ADDR3}}) == True + assert Range(Number(1), min=1, max=3).evaluate({Number(1): {ADDR1, ADDR2, ADDR3, ADDR4}}) == False def test_short_circuit(): - assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(1): 
{ADDR1}}) == True # with short circuiting, only the children up until the first satisfied child are captured. - assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=True).children) == 1 - assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=False).children) == 2 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}, short_circuit=True).children) == 1 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}, short_circuit=False).children) == 2 def test_eval_order(): # base cases. - assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True - assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}) == True + assert Or([Number(1), Number(2)]).evaluate({Number(2): {ADDR1}}) == True # with short circuiting, only the children up until the first satisfied child are captured. - assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children) == 1 - assert len(Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children) == 2 - assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}, Number(2): {1}}).children) == 1 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}).children) == 1 + assert len(Or([Number(1), Number(2)]).evaluate({Number(2): {ADDR1}}).children) == 2 + assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}, Number(2): {ADDR1}}).children) == 1 # and its guaranteed that children are evaluated in order. 
- assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement == Number(1) - assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement != Number(2) + assert Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}).children[0].statement == Number(1) + assert Or([Number(1), Number(2)]).evaluate({Number(1): {ADDR1}}).children[0].statement != Number(2) - assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement == Number(2) - assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement != Number(1) + assert Or([Number(1), Number(2)]).evaluate({Number(2): {ADDR1}}).children[1].statement == Number(2) + assert Or([Number(1), Number(2)]).evaluate({Number(2): {ADDR1}}).children[1].statement != Number(1) diff --git a/tests/test_fmt.py b/tests/test_fmt.py index de96a1f4..1f37886c 100644 --- a/tests/test_fmt.py +++ b/tests/test_fmt.py @@ -98,7 +98,7 @@ def test_rule_reformat_order(): def test_rule_reformat_meta_update(): # test updating the rule content after parsing - rule = textwrap.dedent( + src = textwrap.dedent( """ rule: meta: @@ -116,7 +116,7 @@ def test_rule_reformat_meta_update(): """ ) - rule = capa.rules.Rule.from_yaml(rule) + rule = capa.rules.Rule.from_yaml(src) rule.name = "test rule" assert rule.to_yaml() == EXPECTED diff --git a/tests/test_match.py b/tests/test_match.py index 6fb319cd..2d8b9f2a 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -218,7 +218,7 @@ def test_match_matched_rules(): # the ordering of the rules must not matter, # the engine should match rules in an appropriate order. 
features, _ = match( - capa.rules.topologically_order_rules(reversed(rules)), + capa.rules.topologically_order_rules(list(reversed(rules))), {capa.features.insn.Number(100): {1}}, 0x0, ) diff --git a/tests/test_os_detection.py b/tests/test_os_detection.py index e2f850d7..82e592a7 100644 --- a/tests/test_os_detection.py +++ b/tests/test_os_detection.py @@ -14,13 +14,49 @@ from fixtures import * import capa.features.extractors.elf -def test_elf_section_gnu_abi_tag(): +def test_elf_sh_notes(): + # guess: osabi: None + # guess: ph notes: None + # guess: sh notes: OS.LINUX + # guess: linker: None + # guess: ABI versions needed: None + # guess: needed dependencies: None path = get_data_path_by_name("2f7f5f") with open(path, "rb") as f: assert capa.features.extractors.elf.detect_elf_os(f) == "linux" -def test_elf_program_header_gnu_abi_tag(): +def test_elf_pt_notes(): + # guess: osabi: None + # guess: ph notes: None + # guess: sh notes: OS.LINUX + # guess: linker: OS.LINUX + # guess: ABI versions needed: OS.LINUX + # guess: needed dependencies: None path = get_data_path_by_name("7351f.elf") with open(path, "rb") as f: assert capa.features.extractors.elf.detect_elf_os(f) == "linux" + + +def test_elf_so_needed(): + # guess: osabi: None + # guess: ph notes: None + # guess: sh notes: OS.HURD + # guess: linker: None + # guess: ABI versions needed: OS.HURD + # guess: needed dependencies: OS.HURD + path = get_data_path_by_name("b5f052") + with open(path, "rb") as f: + assert capa.features.extractors.elf.detect_elf_os(f) == "hurd" + + +def test_elf_abi_version_hurd(): + # guess: osabi: None + # guess: ph notes: None + # guess: sh notes: OS.HURD + # guess: linker: None + # guess: ABI versions needed: OS.HURD + # guess: needed dependencies: None + path = get_data_path_by_name("bf7a9c") + with open(path, "rb") as f: + assert capa.features.extractors.elf.detect_elf_os(f) == "hurd" diff --git a/tests/test_render.py b/tests/test_render.py index fff14a95..9277b9f2 100644 --- 
a/tests/test_render.py +++ b/tests/test_render.py @@ -1,10 +1,18 @@ import textwrap +import fixtures + import capa.rules import capa.render.utils +import capa.features.file import capa.features.insn import capa.features.common +import capa.features.freeze +import capa.render.vverbose +import capa.features.address +import capa.features.basicblock import capa.render.result_document +import capa.features.freeze.features def test_render_number(): @@ -92,3 +100,53 @@ def test_render_meta_mbc(): assert mbc.method == method assert capa.render.utils.format_parts_id(mbc) == canonical + + +@fixtures.parametrize( + "feature,expected", + [ + (capa.features.common.OS("windows"), "os: windows"), + (capa.features.common.Arch("i386"), "arch: i386"), + (capa.features.common.Format("pe"), "format: pe"), + (capa.features.common.MatchedRule("foo"), "match: foo @ 0x401000"), + (capa.features.common.Characteristic("foo"), "characteristic: foo @ 0x401000"), + (capa.features.file.Export("SvcMain"), "export: SvcMain @ 0x401000"), + (capa.features.file.Import("CreateFileW"), "import: CreateFileW @ 0x401000"), + (capa.features.file.Section(".detours"), "section: .detours @ 0x401000"), + (capa.features.file.FunctionName("memcmp"), "function name: memcmp @ 0x401000"), + (capa.features.common.Substring("foo"), "substring: foo"), + (capa.features.common.Regex("^foo"), "regex: ^foo"), + (capa.features.common.String("foo"), 'string: "foo" @ 0x401000'), + (capa.features.common.Class("BeanFactory"), "class: BeanFactory @ 0x401000"), + (capa.features.common.Namespace("std::enterprise"), "namespace: std::enterprise @ 0x401000"), + (capa.features.insn.API("CreateFileW"), "api: CreateFileW @ 0x401000"), + (capa.features.insn.Property("foo"), "property: foo @ 0x401000"), + (capa.features.insn.Property("foo", "read"), "property/read: foo @ 0x401000"), + (capa.features.insn.Property("foo", "write"), "property/write: foo @ 0x401000"), + (capa.features.insn.Number(12), "number: 0xC @ 0x401000"), + 
(capa.features.common.Bytes(b"AAAA"), "bytes: 41414141 @ 0x401000"), + (capa.features.insn.Offset(12), "offset: 0xC @ 0x401000"), + (capa.features.insn.Mnemonic("call"), "mnemonic: call @ 0x401000"), + (capa.features.insn.OperandNumber(0, 12), "operand[0].number: 0xC @ 0x401000"), + (capa.features.insn.OperandOffset(0, 12), "operand[0].offset: 0xC @ 0x401000"), + # unsupported + # (capa.features.basicblock.BasicBlock(), "basic block @ 0x401000"), + ], +) +def test_render_vverbose_feature(feature, expected): + ostream = capa.render.utils.StringIO() + + addr = capa.features.freeze.Address.from_capa(capa.features.address.AbsoluteVirtualAddress(0x401000)) + feature = capa.features.freeze.features.feature_from_capa(feature) + + matches = capa.render.result_document.Match( + success=True, + node=capa.render.result_document.FeatureNode(feature=feature), + children=(), + locations=(addr,), + captures={}, + ) + + capa.render.vverbose.render_feature(ostream, matches, feature, indent=0) + + assert ostream.getvalue().strip() == expected diff --git a/tests/test_result_document.py b/tests/test_result_document.py index 8074e1cd..b98fadff 100644 --- a/tests/test_result_document.py +++ b/tests/test_result_document.py @@ -19,6 +19,7 @@ def test_optional_node_from_capa(): [], ) ) + assert isinstance(node, rdoc.StatementNode) assert isinstance(node.statement, rdoc.CompoundStatement) assert node.statement.type == rdoc.CompoundStatementType.OPTIONAL @@ -32,6 +33,7 @@ def test_some_node_from_capa(): ], ) ) + assert isinstance(node, rdoc.StatementNode) assert isinstance(node.statement, rdoc.SomeStatement) @@ -41,6 +43,7 @@ def test_range_node_from_capa(): capa.features.insn.Number(0), ) ) + assert isinstance(node, rdoc.StatementNode) assert isinstance(node.statement, rdoc.RangeStatement) @@ -51,6 +54,7 @@ def test_subscope_node_from_capa(): capa.features.insn.Number(0), ) ) + assert isinstance(node, rdoc.StatementNode) assert isinstance(node.statement, rdoc.SubscopeStatement) @@ -62,6 
+66,7 @@ def test_and_node_from_capa(): ], ) ) + assert isinstance(node, rdoc.StatementNode) assert isinstance(node.statement, rdoc.CompoundStatement) assert node.statement.type == rdoc.CompoundStatementType.AND @@ -74,6 +79,7 @@ def test_or_node_from_capa(): ], ) ) + assert isinstance(node, rdoc.StatementNode) assert isinstance(node.statement, rdoc.CompoundStatement) assert node.statement.type == rdoc.CompoundStatementType.OR @@ -86,115 +92,138 @@ def test_not_node_from_capa(): ], ) ) + assert isinstance(node, rdoc.StatementNode) assert isinstance(node.statement, rdoc.CompoundStatement) assert node.statement.type == rdoc.CompoundStatementType.NOT def test_os_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.OS("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.OSFeature) def test_arch_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Arch("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.ArchFeature) def test_format_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Format("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.FormatFeature) def test_match_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.MatchedRule("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.MatchFeature) def test_characteristic_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Characteristic("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.CharacteristicFeature) def test_substring_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Substring("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.SubstringFeature) def test_regex_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Regex("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, 
frzf.RegexFeature) def test_class_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Class("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.ClassFeature) def test_namespace_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Namespace("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.NamespaceFeature) def test_bytes_node_from_capa(): node = rdoc.node_from_capa(capa.features.common.Bytes(b"")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.BytesFeature) def test_export_node_from_capa(): node = rdoc.node_from_capa(capa.features.file.Export("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.ExportFeature) def test_import_node_from_capa(): node = rdoc.node_from_capa(capa.features.file.Import("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.ImportFeature) def test_section_node_from_capa(): node = rdoc.node_from_capa(capa.features.file.Section("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.SectionFeature) def test_function_name_node_from_capa(): node = rdoc.node_from_capa(capa.features.file.FunctionName("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.FunctionNameFeature) def test_api_node_from_capa(): node = rdoc.node_from_capa(capa.features.insn.API("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.APIFeature) def test_property_node_from_capa(): node = rdoc.node_from_capa(capa.features.insn.Property("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.PropertyFeature) def test_number_node_from_capa(): node = rdoc.node_from_capa(capa.features.insn.Number(0)) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.NumberFeature) def test_offset_node_from_capa(): node = 
rdoc.node_from_capa(capa.features.insn.Offset(0)) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.OffsetFeature) def test_mnemonic_node_from_capa(): node = rdoc.node_from_capa(capa.features.insn.Mnemonic("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.MnemonicFeature) def test_operand_number_node_from_capa(): node = rdoc.node_from_capa(capa.features.insn.OperandNumber(0, 0)) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.OperandNumberFeature) def test_operand_offset_node_from_capa(): node = rdoc.node_from_capa(capa.features.insn.OperandOffset(0, 0)) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.OperandOffsetFeature) def test_basic_block_node_from_capa(): node = rdoc.node_from_capa(capa.features.basicblock.BasicBlock("")) + assert isinstance(node, rdoc.FeatureNode) assert isinstance(node.feature, frzf.BasicBlockFeature) diff --git a/tests/test_rules.py b/tests/test_rules.py index 61bef111..466ac306 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -13,6 +13,8 @@ import pytest import capa.rules import capa.engine import capa.features.common +import capa.features.address +from capa.engine import Or from capa.features.file import FunctionName from capa.features.insn import Number, Offset, Property from capa.features.common import ( @@ -29,12 +31,18 @@ from capa.features.common import ( Substring, FeatureAccess, ) +from capa.features.address import AbsoluteVirtualAddress + +ADDR1 = capa.features.address.AbsoluteVirtualAddress(0x401001) +ADDR2 = capa.features.address.AbsoluteVirtualAddress(0x401002) +ADDR3 = capa.features.address.AbsoluteVirtualAddress(0x401003) +ADDR4 = capa.features.address.AbsoluteVirtualAddress(0x401004) def test_rule_ctor(): - r = capa.rules.Rule("test rule", capa.rules.FUNCTION_SCOPE, Number(1), {}) - assert r.evaluate({Number(0): {1}}) == False - assert r.evaluate({Number(1): {1}}) == True + r = 
capa.rules.Rule("test rule", capa.rules.FUNCTION_SCOPE, Or([Number(1)]), {}) + assert r.evaluate({Number(0): {ADDR1}}) == False + assert r.evaluate({Number(1): {ADDR2}}) == True def test_rule_yaml(): @@ -56,10 +64,10 @@ def test_rule_yaml(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({Number(0): {1}}) == False - assert r.evaluate({Number(0): {1}, Number(1): {1}}) == False - assert r.evaluate({Number(0): {1}, Number(1): {1}, Number(2): {1}}) == True - assert r.evaluate({Number(0): {1}, Number(1): {1}, Number(2): {1}, Number(3): {1}}) == True + assert r.evaluate({Number(0): {ADDR1}}) == False + assert r.evaluate({Number(0): {ADDR1}, Number(1): {ADDR1}}) == False + assert r.evaluate({Number(0): {ADDR1}, Number(1): {ADDR1}, Number(2): {ADDR1}}) == True + assert r.evaluate({Number(0): {ADDR1}, Number(1): {ADDR1}, Number(2): {ADDR1}, Number(3): {ADDR1}}) == True def test_rule_yaml_complex(): @@ -82,8 +90,8 @@ def test_rule_yaml_complex(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({Number(5): {1}, Number(6): {1}, Number(7): {1}, Number(8): {1}}) == True - assert r.evaluate({Number(6): {1}, Number(7): {1}, Number(8): {1}}) == False + assert r.evaluate({Number(5): {ADDR1}, Number(6): {ADDR1}, Number(7): {ADDR1}, Number(8): {ADDR1}}) == True + assert r.evaluate({Number(6): {ADDR1}, Number(7): {ADDR1}, Number(8): {ADDR1}}) == False def test_rule_descriptions(): @@ -160,8 +168,8 @@ def test_rule_yaml_not(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({Number(1): {1}}) == True - assert r.evaluate({Number(1): {1}, Number(2): {1}}) == False + assert r.evaluate({Number(1): {ADDR1}}) == True + assert r.evaluate({Number(1): {ADDR1}, Number(2): {ADDR1}}) == False def test_rule_yaml_count(): @@ -175,9 +183,9 @@ def test_rule_yaml_count(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({Number(100): {}}) == False - assert r.evaluate({Number(100): {1}}) == True - assert r.evaluate({Number(100): {1, 2}}) == False + 
assert r.evaluate({Number(100): set()}) == False + assert r.evaluate({Number(100): {ADDR1}}) == True + assert r.evaluate({Number(100): {ADDR1, ADDR2}}) == False def test_rule_yaml_count_range(): @@ -191,10 +199,10 @@ def test_rule_yaml_count_range(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({Number(100): {}}) == False - assert r.evaluate({Number(100): {1}}) == True - assert r.evaluate({Number(100): {1, 2}}) == True - assert r.evaluate({Number(100): {1, 2, 3}}) == False + assert r.evaluate({Number(100): set()}) == False + assert r.evaluate({Number(100): {ADDR1}}) == True + assert r.evaluate({Number(100): {ADDR1, ADDR2}}) == True + assert r.evaluate({Number(100): {ADDR1, ADDR2, ADDR3}}) == False def test_rule_yaml_count_string(): @@ -208,10 +216,10 @@ def test_rule_yaml_count_string(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({String("foo"): {}}) == False - assert r.evaluate({String("foo"): {1}}) == False - assert r.evaluate({String("foo"): {1, 2}}) == True - assert r.evaluate({String("foo"): {1, 2, 3}}) == False + assert r.evaluate({String("foo"): set()}) == False + assert r.evaluate({String("foo"): {ADDR1}}) == False + assert r.evaluate({String("foo"): {ADDR1, ADDR2}}) == True + assert r.evaluate({String("foo"): {ADDR1, ADDR2, ADDR3}}) == False def test_invalid_rule_feature(): @@ -481,11 +489,11 @@ def test_count_number_symbol(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({Number(2): {}}) == False - assert r.evaluate({Number(2): {1}}) == True - assert r.evaluate({Number(2): {1, 2}}) == False - assert r.evaluate({Number(0x100, description="symbol name"): {1}}) == False - assert r.evaluate({Number(0x100, description="symbol name"): {1, 2, 3}}) == True + assert r.evaluate({Number(2): set()}) == False + assert r.evaluate({Number(2): {ADDR1}}) == True + assert r.evaluate({Number(2): {ADDR1, ADDR2}}) == False + assert r.evaluate({Number(0x100, description="symbol name"): {ADDR1}}) == False + assert 
r.evaluate({Number(0x100, description="symbol name"): {ADDR1, ADDR2, ADDR3}}) == True def test_invalid_number(): @@ -567,11 +575,11 @@ def test_count_offset_symbol(): """ ) r = capa.rules.Rule.from_yaml(rule) - assert r.evaluate({Offset(2): {}}) == False - assert r.evaluate({Offset(2): {1}}) == True - assert r.evaluate({Offset(2): {1, 2}}) == False - assert r.evaluate({Offset(0x100, description="symbol name"): {1}}) == False - assert r.evaluate({Offset(0x100, description="symbol name"): {1, 2, 3}}) == True + assert r.evaluate({Offset(2): set()}) == False + assert r.evaluate({Offset(2): {ADDR1}}) == True + assert r.evaluate({Offset(2): {ADDR1, ADDR2}}) == False + assert r.evaluate({Offset(0x100, description="symbol name"): {ADDR1}}) == False + assert r.evaluate({Offset(0x100, description="symbol name"): {ADDR1, ADDR2, ADDR3}}) == True def test_invalid_offset(): @@ -966,10 +974,10 @@ def test_property_access(): """ ) ) - assert r.evaluate({Property("System.IO.FileInfo::Length", access=FeatureAccess.READ): {1}}) == True + assert r.evaluate({Property("System.IO.FileInfo::Length", access=FeatureAccess.READ): {ADDR1}}) == True - assert r.evaluate({Property("System.IO.FileInfo::Length"): {1}}) == False - assert r.evaluate({Property("System.IO.FileInfo::Length", access=FeatureAccess.WRITE): {1}}) == False + assert r.evaluate({Property("System.IO.FileInfo::Length"): {ADDR1}}) == False + assert r.evaluate({Property("System.IO.FileInfo::Length", access=FeatureAccess.WRITE): {ADDR1}}) == False def test_property_access_symbol(): @@ -986,7 +994,7 @@ def test_property_access_symbol(): ) assert ( r.evaluate( - {Property("System.IO.FileInfo::Length", access=FeatureAccess.READ, description="some property"): {1}} + {Property("System.IO.FileInfo::Length", access=FeatureAccess.READ, description="some property"): {ADDR1}} ) == True )