Add the ability to select which functions or processes you wish to extract capabilities from (#2156)

This commit is contained in:
Yacine
2024-08-20 13:09:46 +01:00
committed by GitHub
parent c409b2b7ed
commit 791f5e2359
6 changed files with 180 additions and 7 deletions

View File

@@ -3,7 +3,7 @@
## master (unreleased) ## master (unreleased)
### New Features ### New Features
- cli: add the ability to select which specific functions or processes to analyze #2156 @yelhamer
- webui: explore capa analysis results in a web-based UI online and offline #2224 @s-ff - webui: explore capa analysis results in a web-based UI online and offline #2224 @s-ff
- support analyzing DRAKVUF traces #2143 @yelhamer - support analyzing DRAKVUF traces #2143 @yelhamer
- IDA extractor: extract names from dynamically resolved APIs stored in renamed global variables #2201 @Ana06 - IDA extractor: extract names from dynamically resolved APIs stored in renamed global variables #2201 @Ana06

View File

@@ -23,3 +23,15 @@ class UnsupportedOSError(ValueError):
class EmptyReportError(ValueError): class EmptyReportError(ValueError):
pass pass
class InvalidArgument(ValueError):
    """Signals that a user-supplied argument is not valid for the requested analysis."""
class NonExistantFunctionError(ValueError):
    """Error for a function that does not exist.

    NOTE(review): no raise site is visible alongside this definition; presumably
    used when a requested function address is not found - confirm against callers.
    """
class NonExistantProcessError(ValueError):
    """Error for a process that does not exist.

    NOTE(review): no raise site is visible alongside this definition; presumably
    used when a requested process ID is not found - confirm against callers.
    """

View File

@@ -9,7 +9,9 @@
import abc import abc
import hashlib import hashlib
import dataclasses import dataclasses
from typing import Any, Dict, Tuple, Union, Iterator from copy import copy
from types import MethodType
from typing import Any, Set, Dict, Tuple, Union, Iterator
from dataclasses import dataclass from dataclasses import dataclass
# TODO(williballenthin): use typing.TypeAlias directly when Python 3.9 is deprecated # TODO(williballenthin): use typing.TypeAlias directly when Python 3.9 is deprecated
@@ -296,6 +298,22 @@ class StaticFeatureExtractor:
raise NotImplementedError() raise NotImplementedError()
def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticFeatureExtractor:
    """
    Build a copy of `extractor` whose get_functions() yields only the selected functions.

    args:
      extractor: the static feature extractor to wrap; it is left unmodified.
      functions: collection of function addresses to keep. a function is
        kept when its `address` is contained in this collection.

    returns:
      a shallow copy of `extractor` with get_functions() replaced by the filtering variant.
    """
    # grab the bound method now, so that the filter keeps delegating to the unfiltered implementation.
    unfiltered_get_functions = extractor.get_functions

    def get_selected_functions(self):
        for fh in unfiltered_get_functions():
            if fh.address in functions:
                yield fh

    # patch a shallow copy instead of the extractor itself: the caller's object keeps
    # its original get_functions(), which matters when the same extractor instance is shared.
    # for example, our test fixtures reuse one extractor across tests,
    # and only some of those tests install a functions filter.
    restricted = copy(extractor)
    restricted.get_functions = MethodType(get_selected_functions, extractor)  # type: ignore
    return restricted
@dataclass @dataclass
class ProcessHandle: class ProcessHandle:
""" """
@@ -467,4 +485,20 @@ class DynamicFeatureExtractor:
raise NotImplementedError() raise NotImplementedError()
def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> DynamicFeatureExtractor:
    """
    Build a copy of `extractor` whose get_processes() yields only the selected processes.

    args:
      extractor: the dynamic feature extractor to wrap; it is left unmodified.
      processes: collection of process IDs to keep. a process is
        kept when its `address.pid` is contained in this collection.

    returns:
      a shallow copy of `extractor` with get_processes() replaced by the filtering variant.
    """
    # grab the bound method now, so that the filter keeps delegating to the unfiltered implementation.
    unfiltered_get_processes = extractor.get_processes

    def get_selected_processes(self):
        for ph in unfiltered_get_processes():
            if ph.address.pid in processes:
                yield ph

    # patch a shallow copy instead of the extractor itself: the caller's object keeps
    # its original get_processes(), which matters when the same extractor instance is shared.
    # for example, our test fixtures reuse one extractor across tests,
    # and only some of those tests install a processes filter.
    restricted = copy(extractor)
    restricted.get_processes = MethodType(get_selected_processes, extractor)  # type: ignore
    return restricted
FeatureExtractor: TypeAlias = Union[StaticFeatureExtractor, DynamicFeatureExtractor] FeatureExtractor: TypeAlias = Union[StaticFeatureExtractor, DynamicFeatureExtractor]

View File

@@ -17,7 +17,7 @@ import argparse
import textwrap import textwrap
import contextlib import contextlib
from types import TracebackType from types import TracebackType
from typing import Any, Dict, List, Optional from typing import Any, Set, Dict, List, Optional, TypedDict
from pathlib import Path from pathlib import Path
import colorama import colorama
@@ -62,6 +62,7 @@ from capa.helpers import (
log_unsupported_drakvuf_report_error, log_unsupported_drakvuf_report_error,
) )
from capa.exceptions import ( from capa.exceptions import (
InvalidArgument,
EmptyReportError, EmptyReportError,
UnsupportedOSError, UnsupportedOSError,
UnsupportedArchError, UnsupportedArchError,
@@ -83,9 +84,17 @@ from capa.features.common import (
FORMAT_FREEZE, FORMAT_FREEZE,
FORMAT_RESULT, FORMAT_RESULT,
FORMAT_DRAKVUF, FORMAT_DRAKVUF,
STATIC_FORMATS,
DYNAMIC_FORMATS,
) )
from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities
from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor from capa.features.extractors.base_extractor import (
ProcessFilter,
FunctionFilter,
FeatureExtractor,
StaticFeatureExtractor,
DynamicFeatureExtractor,
)
RULES_PATH_DEFAULT_STRING = "(embedded rules)" RULES_PATH_DEFAULT_STRING = "(embedded rules)"
SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)"
@@ -106,10 +115,17 @@ E_MISSING_CAPE_STATIC_ANALYSIS = 21
E_MISSING_CAPE_DYNAMIC_ANALYSIS = 22 E_MISSING_CAPE_DYNAMIC_ANALYSIS = 22
E_EMPTY_REPORT = 23 E_EMPTY_REPORT = 23
E_UNSUPPORTED_GHIDRA_EXECUTION_MODE = 24 E_UNSUPPORTED_GHIDRA_EXECUTION_MODE = 24
E_INVALID_INPUT_FORMAT = 25
E_INVALID_FEATURE_EXTRACTOR = 26
logger = logging.getLogger("capa") logger = logging.getLogger("capa")
class FilterConfig(TypedDict, total=False):
    """Extractor filters selected by the user; every key is optional (total=False)."""

    # process IDs to keep; handed to ProcessFilter for dynamic extractors.
    processes: Set[int]
    # function addresses to keep; handed to FunctionFilter for static extractors.
    functions: Set[int]
@contextlib.contextmanager @contextlib.contextmanager
def timing(msg: str): def timing(msg: str):
t0 = time.time() t0 = time.time()
@@ -276,6 +292,22 @@ def install_common_args(parser, wanted=None):
help=f"select backend, {backend_help}", help=f"select backend, {backend_help}",
) )
if "restrict-to-functions" in wanted:
parser.add_argument(
"--restrict-to-functions",
type=lambda s: s.replace(" ", "").split(","),
default=[],
help="provide a list of comma-separated function virtual addresses to analyze (static analysis).",
)
if "restrict-to-processes" in wanted:
parser.add_argument(
"--restrict-to-processes",
type=lambda s: s.replace(" ", "").split(","),
default=[],
help="provide a list of comma-separated process IDs to analyze (dynamic analysis).",
)
if "os" in wanted: if "os" in wanted:
oses = [ oses = [
(OS_AUTO, "detect OS automatically - default"), (OS_AUTO, "detect OS automatically - default"),
@@ -749,9 +781,10 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr
os_ = get_os_from_cli(args, backend) os_ = get_os_from_cli(args, backend)
sample_path = get_sample_path_from_cli(args, backend) sample_path = get_sample_path_from_cli(args, backend)
extractor_filters = get_extractor_filters_from_cli(args, input_format)
try: try:
return capa.loader.get_extractor( extractor = capa.loader.get_extractor(
args.input_file, args.input_file,
input_format, input_format,
os_, os_,
@@ -761,6 +794,7 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr
disable_progress=args.quiet or args.debug, disable_progress=args.quiet or args.debug,
sample_path=sample_path, sample_path=sample_path,
) )
return apply_extractor_filters(extractor, extractor_filters)
except UnsupportedFormatError as e: except UnsupportedFormatError as e:
if input_format == FORMAT_CAPE: if input_format == FORMAT_CAPE:
log_unsupported_cape_report_error(str(e)) log_unsupported_cape_report_error(str(e))
@@ -780,6 +814,38 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr
raise ShouldExitError(E_CORRUPT_FILE) from e raise ShouldExitError(E_CORRUPT_FILE) from e
def _parse_filter_values(raw_values: List[str], description: str) -> Set[int]:
    """
    Convert the number strings collected from the command line into a set of integers.

    args:
      raw_values: strings such as "0x4019C0" or "3888".
      description: human-readable name of what is being parsed, used in the error message.

    raises:
      InvalidArgument: if any value is not a valid integer literal.
    """
    parsed: Set[int] = set()
    for raw in raw_values:
        try:
            # base 0 makes int() accept decimal as well as prefixed (0x, 0o, 0b) literals.
            parsed.add(int(raw, 0))
        except ValueError as e:
            # e.g. a trailing comma produces an empty token; name the culprit instead of
            # leaking the generic int() message. InvalidArgument is still a ValueError.
            raise InvalidArgument(f"invalid {description}: {raw!r}") from e
    return parsed


def get_extractor_filters_from_cli(args, input_format) -> FilterConfig:
    """
    Build the extractor filter configuration from the parsed command line arguments.

    args:
      args: the parsed arguments; `restrict_to_functions` and `restrict_to_processes`
        are read when present, and either one may be missing.
      input_format: the input format, checked against STATIC_FORMATS and DYNAMIC_FORMATS.

    returns:
      an empty dict when neither filter option is installed on `args`;
      otherwise a dict with a "functions" key (static formats) or a "processes" key (dynamic formats).

    raises:
      InvalidArgument: if a filter does not apply to the kind of analysis, or a value is not an integer.
      ShouldExitError: if the input format is neither static nor dynamic.
    """
    if not hasattr(args, "restrict_to_processes") and not hasattr(args, "restrict_to_functions"):
        # no processes or function filters were installed in the args
        return {}

    # a script may install only one of the two options, so treat a missing one as "no filter"
    # rather than failing with AttributeError.
    restrict_to_functions = getattr(args, "restrict_to_functions", None) or []
    restrict_to_processes = getattr(args, "restrict_to_processes", None) or []

    if input_format in STATIC_FORMATS:
        if restrict_to_processes:
            raise InvalidArgument("Cannot filter processes with static analysis.")
        return {"functions": _parse_filter_values(restrict_to_functions, "function address")}

    if input_format in DYNAMIC_FORMATS:
        if restrict_to_functions:
            raise InvalidArgument("Cannot filter functions with dynamic analysis.")
        return {"processes": _parse_filter_values(restrict_to_processes, "process ID")}

    raise ShouldExitError(E_INVALID_INPUT_FORMAT)
def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: FilterConfig) -> FeatureExtractor:
    """
    Wrap `extractor` with the filters requested by the user, if any.

    args:
      extractor: the feature extractor to restrict.
      extractor_filters: the requested filters; missing keys and empty sets mean "no filter".

    returns:
      `extractor` itself when no filter is set, otherwise the result of
      FunctionFilter (static extractors) or ProcessFilter (dynamic extractors).

    raises:
      InvalidArgument: if the only filter provided does not apply to this kind of extractor.
      ShouldExitError: if the extractor is neither static nor dynamic.
    """
    if not any(extractor_filters.values()):
        return extractor

    # if the user specified extractor filters, then apply them here.
    # validate explicitly: FilterConfig keys are optional, and asserts vanish under `python -O`.
    if isinstance(extractor, StaticFeatureExtractor):
        functions = extractor_filters.get("functions")
        if not functions:
            raise InvalidArgument("Cannot filter processes with static analysis.")
        return FunctionFilter(extractor, functions)

    if isinstance(extractor, DynamicFeatureExtractor):
        processes = extractor_filters.get("processes")
        if not processes:
            raise InvalidArgument("Cannot filter functions with dynamic analysis.")
        return ProcessFilter(extractor, processes)

    raise ShouldExitError(E_INVALID_FEATURE_EXTRACTOR)
def main(argv: Optional[List[str]] = None): def main(argv: Optional[List[str]] = None):
if sys.version_info < (3, 8): if sys.version_info < (3, 8):
raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.8+") raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.8+")
@@ -819,7 +885,20 @@ def main(argv: Optional[List[str]] = None):
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter
) )
install_common_args(parser, {"input_file", "format", "backend", "os", "signatures", "rules", "tag"}) install_common_args(
parser,
{
"input_file",
"format",
"backend",
"os",
"signatures",
"rules",
"tag",
"restrict-to-functions",
"restrict-to-processes",
},
)
parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text") parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text")
args = parser.parse_args(args=argv) args = parser.parse_args(args=argv)

View File

@@ -9,6 +9,22 @@ Use the `-t` option to run rules with the given metadata value (see the rule fie
For example, `capa -t william.ballenthin@mandiant.com` runs rules that reference Willi's email address (probably as the author), or For example, `capa -t william.ballenthin@mandiant.com` runs rules that reference Willi's email address (probably as the author), or
`capa -t communication` runs rules with the namespace `communication`. `capa -t communication` runs rules with the namespace `communication`.
### only analyze selected functions
Use the `--restrict-to-functions` option to extract capabilities from only a selected set of functions. This is useful for analyzing
large functions and finding both their capabilities and the addresses at which they occur; for example: PEB access, RC4 encryption, etc.
To use this, copy the virtual addresses from your favorite disassembler and pass them to capa as a comma-separated list:
`capa sample.exe --restrict-to-functions 0x4019C0,0x401CD0`. If you add the `-v` option then capa will extract the interesting parts of a function for you.
### only analyze selected processes
Use the `--restrict-to-processes` option to extract capabilities from only a selected set of processes. This is useful for filtering out the noise
generated by non-malicious processes that some sandboxes report, and it reduces the execution time
because such processes are not analyzed in the first place.
To use this, pick the PIDs of the processes you are interested in from the sandbox-generated process tree (or from the sandbox-reported malware PID)
and pass them to capa as a comma-separated list: `capa report.log --restrict-to-processes 3888,3214,4299`. If you add the `-v` option then capa will tell you
which threads perform which actions (encrypt/decrypt data, initiate a connection, etc.).
### IDA Pro plugin: capa explorer ### IDA Pro plugin: capa explorer
Please check out the [capa explorer documentation](/capa/ida/plugin/README.md). Please check out the [capa explorer documentation](/capa/ida/plugin/README.md).
@@ -16,4 +32,4 @@ Please check out the [capa explorer documentation](/capa/ida/plugin/README.md).
Set the environment variable `CAPA_SAVE_WORKSPACE` to instruct the underlying analysis engine to Set the environment variable `CAPA_SAVE_WORKSPACE` to instruct the underlying analysis engine to
cache its intermediate results to the file system. For example, vivisect will create `.viv` files. cache its intermediate results to the file system. For example, vivisect will create `.viv` files.
Subsequently, capa may run faster when reprocessing the same input file. Subsequently, capa may run faster when reprocessing the same input file.
This is particularly useful during rule development as you repeatedly test a rule against a known sample. This is particularly useful during rule development as you repeatedly test a rule against a known sample.

View File

@@ -9,6 +9,7 @@
import textwrap import textwrap
import capa.capabilities.common import capa.capabilities.common
from capa.features.extractors.base_extractor import FunctionFilter
def test_match_across_scopes_file_function(z9324d_extractor): def test_match_across_scopes_file_function(z9324d_extractor):
@@ -174,6 +175,37 @@ def test_subscope_bb_rules(z9324d_extractor):
assert "test rule" in capabilities assert "test rule" in capabilities
def test_match_specific_functions(z9324d_extractor):
    """Check that capabilities are only reported for functions selected through FunctionFilter."""
    rules = capa.rules.RuleSet(
        [
            capa.rules.Rule.from_yaml(
                textwrap.dedent(
                    """
                    rule:
                        meta:
                            name: receive data
                            scopes:
                                static: function
                                dynamic: call
                            examples:
                                - 9324d1a8ae37a36ae560c37448c9705a:0x401CD0
                        features:
                            - or:
                                - api: recv
                    """
                )
            )
        ]
    )
    # restrict the extractor to a single function address; FunctionFilter returns a filtered
    # copy, so the shared fixture itself is not modified.
    extractor = FunctionFilter(z9324d_extractor, {0x4019C0})
    capabilities, meta = capa.capabilities.common.find_capabilities(rules, extractor)
    matches = capabilities["receive data"]
    # test that we received only one match
    assert len(matches) == 1
    # and that this match is from the specified function
    assert matches[0][0] == 0x4019C0
def test_byte_matching(z9324d_extractor): def test_byte_matching(z9324d_extractor):
rules = capa.rules.RuleSet( rules = capa.rules.RuleSet(
[ [