From 35e53e96916a4c84479b8dfe74a4fa586c4d7adc Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 8 Jun 2023 23:15:29 +0000 Subject: [PATCH 01/82] add abstract DynamicExtractor class --- capa/features/extractors/base_extractor.py | 104 ++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 3be983ed..e3b780d1 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -8,7 +8,7 @@ import abc import dataclasses -from typing import Any, Dict, Tuple, Union, Iterator +from typing import Any, Dict, Tuple, Union, Iterator, TextIO, BinaryIO from dataclasses import dataclass import capa.features.address @@ -262,3 +262,105 @@ class FeatureExtractor: Tuple[Feature, Address]: feature and its location """ raise NotImplementedError() + + +@dataclass +class ProcessHandle: + """ + reference to a process extracted by the sandbox. + + Attributes: + pid: process id + inner: sandbox-specific data + """ + + pid: int + inner: Any + + +@dataclass +class ThreadHandle: + """ + reference to a thread extracted by the sandbox. + + Attributes: + tid: thread id + inner: sandbox-specific data + """ + + tid: int + inner: Any + + +class DynamicExtractor(FeatureExtractor): + """ + DynamicExtractor defines the interface for fetching features from a sandbox' analysis of a sample. + + Features are grouped mainly into threads that alongside their meta-features are also grouped into + processes (that also have their own features). Other scopes (such as function and file) may also apply + for a specific sandbox. + + This class is not instantiated directly; it is the base class for other implementations. + """ + + def __init__(self): + super().__init__() + + @abc.abstractmethod + def get_processes(self) -> Iterator[ProcessHandle]: + """ + Yields all the child-processes of a parent one. + + Attributes: + ph: parent process + """ + raise NotImplementedError() + + @abc.abstractmethod + def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: + """ + Yields all the features of a process. These include: + - file features of the process' image + - inter-process injection + - detected dynamic DLL loading + """ + raise NotImplementedError() + + @abc.abstractmethod + def get_threads(self, ph: ProcessHandle) -> Iterator[ProcessHandle]: + """ + Yields all the threads that a process created. + + Attributes: + ph: parent process + """ + raise NotImplementedError() + + @abc.abstractmethod + def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: + """ + Yields all the features of a thread. These include: + - sequenced api traces + - files/registris interacted with + - network activity + """ + raise NotImplementedError() + + @abc.abstractclassmethod + def from_trace(cls, trace: TextIO) -> "DynamicExtractor": + """ + Most sandboxes provide reports in a serialized text format (i.e. JSON for Cuckoo and CAPE). + This routine takes a file descriptor of such report (analysis trace) and returns a corresponding DynamicExtractor object. + """ + raise NotImplementedError() + + @abc.abstractclassmethod + def submit_sample(cls, sample: BinaryIO, api: Dict[str, str]) -> "DynamicExtractor": + """ + This routine takes a sample and submits it for analysis to the provided api. The trace should then ideally be passed to the from_trace() method. + + Attributes: + sample: file descriptor of the sample + api: contains information such as the uri, api key, etc. + """ + raise NotImplementedError() From dac103c621177eb3e967e781583d910859e9c0ec Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 9 Jun 2023 09:03:09 +0000 Subject: [PATCH 02/82] fix bad comment Co-authored-by: Moritz --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index e3b780d1..b006c762 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -341,7 +341,7 @@ class DynamicExtractor(FeatureExtractor): """ Yields all the features of a thread. These include: - sequenced api traces - - files/registris interacted with + - file/registry interactions - network activity """ raise NotImplementedError() From f243749d38bb831c429c9d717cc6c5700b1c3845 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 9 Jun 2023 09:03:49 +0000 Subject: [PATCH 03/82] get_threads(): fix mypy typing Co-authored-by: Moritz --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index b006c762..9911fd13 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -327,7 +327,7 @@ class DynamicExtractor(FeatureExtractor): raise NotImplementedError() @abc.abstractmethod - def get_threads(self, ph: ProcessHandle) -> Iterator[ProcessHandle]: + def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: """ Yields all the threads that a process created. From a2b3a38f86ab08b97882b0129a1afd2744384993 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Sat, 10 Jun 2023 20:06:57 +0100 Subject: [PATCH 04/82] add the cape extractor's file hierarchy --- capa/features/extractors/cape/__init__.py | 0 capa/features/extractors/cape/extractor.py | 0 capa/features/extractors/cape/file.py | 0 capa/features/extractors/cape/process.py | 0 capa/features/extractors/cape/thread.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 capa/features/extractors/cape/__init__.py create mode 100644 capa/features/extractors/cape/extractor.py create mode 100644 capa/features/extractors/cape/file.py create mode 100644 capa/features/extractors/cape/process.py create mode 100644 capa/features/extractors/cape/thread.py diff --git a/capa/features/extractors/cape/__init__.py b/capa/features/extractors/cape/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py new file mode 100644 index 00000000..e69de29b diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py new file mode 100644 index 00000000..e69de29b diff --git a/capa/features/extractors/cape/process.py b/capa/features/extractors/cape/process.py new file mode 100644 index 00000000..e69de29b diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py new file mode 100644 index 00000000..e69de29b From 86e2f83a7dbb5968dbed21caa68719a8da15a816 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Sun, 11 Jun 2023 23:19:24 +0100 Subject: [PATCH 05/82] extend the API feature to support an strace-like argument style --- capa/features/insn.py | 52 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/capa/features/insn.py b/capa/features/insn.py index f4be23c8..96396f6d 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import abc -from typing import Union, Optional +from typing import Tuple, Union, Optional, Dict import capa.helpers from capa.features.common import VALID_FEATURE_ACCESS, Feature @@ -21,9 +21,55 @@ def hex(n: int) -> str: class API(Feature): - def __init__(self, name: str, description=None): - super().__init__(name, description=description) + def __init__(self, signature: str, description=None): + if signature.isidentifier(): + # api call is in the legacy format + super().__init__(signature, description=description) + self.args = {} + self.ret = False + else: + # api call is in the strace format and therefore has to be parsed + name, self.args, self.ret = self.parse_signature(signature) + super().__init__(name, description=description) + # store the original signature for hashing purposes + self.signature = signature + + def __hash__(self): + return hash(self.signature) + + def __eq__(self, other): + if not isinstance(other, API): + return False + + assert(isinstance(other, API)) + if {} in (self.args, other.args) or False in (self.ret, other.ret): + # Legacy API feature + return super().__eq__(other) + + # API call with arguments + return super().__eq__(other) and self.args == other.args and self.ret == other.ret + + def parse_signature(self, signature: str) -> Tuple[str, Optional[Dict[str, str]], Optional[str]]: + # todo: optimize this method and improve the code quality + import re + + args = ret = False + + match = re.findall(r"(.+\(.*\)) ?=? ?([^=]*)", signature) + if not match: + return "", None, None + if len(match[0]) == 2: + ret = match[0][1] + + match = re.findall(r"(.*)\((.*)\)", match[0][0]) + if len(match[0]) == 2: + args = (match[0][1]+", ").split(", ") + map(lambda x: {f"arg{x[0]}": x[1]}, enumerate(args)) + args = [{} | arg for arg in args][0] + + return match[0][0], args, ret + class _AccessFeature(Feature, abc.ABC): # superclass: don't use directly From efe1d1c0acc85cca29c0017e960497decc2e9ec8 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 12 Jun 2023 00:05:20 +0100 Subject: [PATCH 06/82] add a Registry feature --- capa/features/common.py | 12 ++++++++++++ capa/rules/__init__.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/capa/features/common.py b/capa/features/common.py index 5060ebaa..812889e3 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -272,6 +272,18 @@ class _MatchedSubstring(Substring): return f'substring("{self.value}", matches = {matches})' +class Registry(String): + # todo: add a way to tell whether this registry key was created, accessed, or deleted. + def __init__(self, value: str, description=None): + super().__init__(value, description) + + def __eq__(self, other): + # Registry instance is in a ruleset + if isinstance(other, Registry): + return super().__eq__(other) + return False + + class Regex(String): def __init__(self, value: str, description=None): super().__init__(value, description=description) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 64fd7e37..d83b6717 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -261,6 +261,8 @@ def parse_feature(key: str): return capa.features.common.StringFactory elif key == "substring": return capa.features.common.Substring + elif key == "registry": + return capa.features.common.Registry elif key == "bytes": return capa.features.common.Bytes elif key == "number": From 632b3ff07c0e4a2d59fdefb24b9d672088b69b78 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 12 Jun 2023 00:06:05 +0100 Subject: [PATCH 07/82] add a Filename feature --- capa/features/common.py | 12 ++++++++++++ capa/rules/__init__.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/capa/features/common.py b/capa/features/common.py index 812889e3..2563887a 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -284,6 +284,18 @@ class Registry(String): return False +class Filename(String): + # todo: add a way to tell whether this file was created, accessed, or deleted. + def __init__(self, value: str, description=None): + super().__init__(value, description) + + def __eq__(self, other): + # Mutex instance is in a ruleset + if isinstance(other, Filename): + return super().__eq__(other) + return False + + class Regex(String): def __init__(self, value: str, description=None): super().__init__(value, description=description) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index d83b6717..9000fe92 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -263,6 +263,8 @@ def parse_feature(key: str): return capa.features.common.Substring elif key == "registry": return capa.features.common.Registry + elif key == "filename": + return capa.features.common.Filename elif key == "bytes": return capa.features.common.Bytes elif key == "number": From 5a10b612a1b206e7cdba5026339bb62377ab35bc Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 12 Jun 2023 00:06:53 +0100 Subject: [PATCH 08/82] add a Mutex feature --- capa/features/common.py | 12 ++++++++++++ capa/rules/__init__.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/capa/features/common.py b/capa/features/common.py index 2563887a..8318dee5 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -296,6 +296,18 @@ class Filename(String): return False +class Mutex(String): + # todo: add a way to tell whether this mutex was created or used + def __init__(self, value: str, description=None): + super().__init__(value, description) + + def __eq__(self, other): + # Mutex instance is in a ruleset + if isinstance(other, Mutex): + return super().__eq__(other) + return False + + class Regex(String): def __init__(self, value: str, description=None): super().__init__(value, description=description) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 9000fe92..01908790 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -265,6 +265,8 @@ def parse_feature(key: str): return capa.features.common.Registry elif key == "filename": return capa.features.common.Filename + elif key == "mutex": + return capa.features.common.Mutex elif key == "bytes": return capa.features.common.Bytes elif key == "number": From a6ca3aaa666d80614d8b700abac36c46f439e629 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 13 Jun 2023 14:23:50 +0100 Subject: [PATCH 09/82] remove from_trace() and submit_sample() methods --- capa/features/extractors/base_extractor.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 9911fd13..c3d04736 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -345,22 +345,3 @@ class DynamicExtractor(FeatureExtractor): - network activity """ raise NotImplementedError() - - @abc.abstractclassmethod - def from_trace(cls, trace: TextIO) -> "DynamicExtractor": - """ - Most sandboxes provide reports in a serialized text format (i.e. JSON for Cuckoo and CAPE). - This routine takes a file descriptor of such report (analysis trace) and returns a corresponding DynamicExtractor object. - """ - raise NotImplementedError() - - @abc.abstractclassmethod - def submit_sample(cls, sample: BinaryIO, api: Dict[str, str]) -> "DynamicExtractor": - """ - This routine takes a sample and submits it for analysis to the provided api. The trace should then ideally be passed to the from_trace() method. - - Attributes: - sample: file descriptor of the sample - api: contains information such as the uri, api key, etc. - """ - raise NotImplementedError() From 3aa7c96902697e3f8251cfaa8ef7b6a389fd5ee8 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 13 Jun 2023 22:54:52 +0100 Subject: [PATCH 10/82] add cape extractor class --- capa/features/extractors/cape/extractor.py | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index e69de29b..a402c3a3 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -0,0 +1,66 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging +from typing import Any, Dict, List, Tuple, Iterator + +import capa.features.extractors.cape.global_ +import capa.features.extractors.cape.process +import capa.features.extractors.cape.file +import capa.features.extractors.cape.thread +from capa.features.common import Feature +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle, DynamicExtractor + +logger = logging.getLogger(__name__) + + +class CapeExtractor(DynamicExtractor): + def __init__(self, static: Dict, behavior: Dict, network: Dict): + super().__init__() + self.static = static + self.behavior = behavior + + self.global_features = capa.features.extractors.cape.global_.extract_features(self.static) + + + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: + yield from self.global_features + + def get_file_features(self) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.cape.file.extract_features(self.static) + + def get_processes(self) -> Iterator[ProcessHandle]: + yield from capa.features.extractors.cape.process.get_processes(self.behavior) + + def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.cape.process.extract_features(self.behavior, ph) + + def get_threads(self, ph: ProcessHandle) -> Iterator[ProcessHandle]: + yield from capa.features.extractors.cape.process.get_threads(self.behavior, ph) + + def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.cape.thread.extract_features(self.behavior, ph, th) + + + @classmethod + def from_report(cls, report: Dict) -> "DynamicExtractor": + # todo: + # 1. make the information extraction code more elegant + # 2. filter out redundant cape features in an efficient way + static = report["static"] + format_ = list(static.keys())[0] + static = static[format_] + static.update(report["target"]) + static.update({"format": format_}) + + behavior = report.pop("behavior") + behavior.update(behavior.pop("summary")) + behavior["network"] = report.pop("network") + + return cls(static, behavior) \ No newline at end of file From 0274cf3ec717141b7b1e1f9a7dc2f6c950b7dc9a Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 13 Jun 2023 22:55:42 +0100 Subject: [PATCH 11/82] add cape's global features' extraction module --- capa/features/extractors/cape/global_.py | 93 ++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 capa/features/extractors/cape/global_.py diff --git a/capa/features/extractors/cape/global_.py b/capa/features/extractors/cape/global_.py new file mode 100644 index 00000000..c4f13840 --- /dev/null +++ b/capa/features/extractors/cape/global_.py @@ -0,0 +1,93 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging +from typing import Tuple, Iterator + +from capa.features.address import Address, NO_ADDRESS +from capa.features.common import ( + OS, + OS_ANY, + ARCH_I386, + ARCH_AMD64, + ARCH_ANY, + FORMAT_PE, + FORMAT_ELF, + FORMAT_UNKNOWN, + OS_WINDOWS, + OS_LINUX, + Arch, + Format, + Feature, +) + + +logger = logging.getLogger(__name__) + + +def guess_elf_os(file_output) -> Iterator[Tuple[Feature, Address]]: + # operating systems recognized by the file command: https://github.com/file/file/blob/master/src/readelf.c#L609 + if "Linux" in file_output: + return OS(OS_LINUX), NO_ADDRESS + elif "Hurd" in file_output: + return OS("hurd"), NO_ADDRESS + elif "Solairs" in file_output: + return OS("solaris"), NO_ADDRESS + elif "kFreeBSD" in file_output: + return OS("freebsd"), NO_ADDRESS + elif "kNetBSD" in file_output: + return OS("netbsd"), NO_ADDRESS + else: + return OS(OS_ANY), NO_ADDRESS + + +def extract_arch(static) -> Iterator[Tuple[Feature, Address]]: + if "Intel 80386" in static["target"]["type"]: + return Arch(ARCH_I386), NO_ADDRESS + elif "x86-64" in static["target"]["type"]: + return Arch(ARCH_AMD64), NO_ADDRESS + else: + return Arch(ARCH_ANY) + + +def extract_format(static) -> Iterator[Tuple[Feature, Address]]: + if "PE" in static["target"]["type"]: + return Format(FORMAT_PE), NO_ADDRESS + elif "ELF" in static["target"]["type"]: + return Format(FORMAT_ELF), NO_ADDRESS + else: + logger.debug(f"unknown file format, file command output: {static['target']['type']}") + return Format(FORMAT_UNKNOWN), NO_ADDRESS + + +def extract_os(static) -> Iterator[Tuple[Feature, Address]]: + # CAPE includes the output of the file command in the + file_command = static["target"]["type"] + + if "WINDOWS" in file_command: + return OS(OS_WINDOWS), NO_ADDRESS + elif "ELF" in file_command: + # implement os guessing from the cape trace + return guess_elf_os(file_command) + else: + # the sample is shellcode + logger.debug(f"unsupported file format, file command output: {file_command}") + return OS(OS_ANY), NO_ADDRESS + + +def extract_features(static) -> Iterator[Tuple[Feature, Address]]: + for global_handler in GLOBAL_HANDLER: + for feature, va in global_handler(static): + yield feature, va + + +GLOBAL_HANDLER = ( + extract_arch, + extract_format, + extract_os, +) \ No newline at end of file From a7917a0f3dbcddf176eb2863eb70da3df1c951e1 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 13 Jun 2023 22:56:15 +0100 Subject: [PATCH 12/82] add cape's thread features' extraction module --- capa/features/extractors/cape/thread.py | 54 +++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index e69de29b..08ade933 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -0,0 +1,54 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging +from typing import Any, Dict, List, Tuple, Iterator + +import capa.features.extractors.cape.global_ +import capa.features.extractors.cape.process +import capa.features.extractors.cape.file +import capa.features.extractors.cape.thread +from capa.features.common import Feature, String +from capa.features.insn import API, Number +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle, DynamicExtractor + + +logger = logging.getLogger(__name__) + + +def extract_call_features(calls: List[Dict], th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: + tid = str(th.tid) + for call in calls: + if call["thead_id"] != tid: + continue + + yield API(call["api"]), int(call["caller"], 16) + yield Number(int(call["return"], 16)), int(call["caller"], 16) + for arg in call["arguments"]: + if arg["value"].isdecimal(): + yield Number(int(arg["value"])), int(call["caller"], 16) + continue + try: + yield Number(int(arg["value"], 16)), int(call["caller"], 16) + except: + yield String{arg["value"]}, int(call["caller"], 16) + + +def extract_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: + processes: List = behavior["processes"] + search_result = list(map(lambda proc: proc["process_id"] == ph.pid and proc["parent_id"] == ph.ppid, processes)) + process = processes[search_result.index(True)] + + for handler in THREAD_HANDLERS: + handler(process["calls"]) + + +THREAD_HANDLERS = ( + extract_call_features, +) \ No newline at end of file From 5ee4fc2cd54ddae0292c6374170ef0fd0bc15aa4 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 13 Jun 2023 23:02:00 +0100 Subject: [PATCH 13/82] add parent process id to the process handle --- capa/features/extractors/base_extractor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index c3d04736..5724e628 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -274,6 +274,7 @@ class ProcessHandle: inner: sandbox-specific data """ + ppid: int pid: int inner: Any From ece47c9ed5ff24465de2838b9b4c0941a92c0a02 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 14 Jun 2023 09:05:53 +0100 Subject: [PATCH 14/82] add ppid documentation to the dynamic extractor interface --- capa/features/extractors/base_extractor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 5724e628..b0b8126c 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -270,6 +270,7 @@ class ProcessHandle: reference to a process extracted by the sandbox. Attributes: + ppid: parent process id pid: process id inner: sandbox-specific data """ From baf209f3cc59ac43d63edeb5d697a0fd462edf4c Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 14 Jun 2023 09:33:07 +0100 Subject: [PATCH 15/82] remove ppid member from ProcessHandle Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index b0b8126c..9c672706 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -275,7 +275,6 @@ class ProcessHandle: inner: sandbox-specific data """ - ppid: int pid: int inner: Any From edcfece993964bab3305db48828ef7073355c777 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 14 Jun 2023 09:33:24 +0100 Subject: [PATCH 16/82] remove default implementation Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 9c672706..32911d39 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -303,10 +303,6 @@ class DynamicExtractor(FeatureExtractor): This class is not instantiated directly; it is the base class for other implementations. """ - - def __init__(self): - super().__init__() - @abc.abstractmethod def get_processes(self) -> Iterator[ProcessHandle]: """ From 7198ebefc92ea895b89e832baaf89f8cdebec3e7 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 14 Jun 2023 09:58:33 +0100 Subject: [PATCH 17/82] remove redundant types Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 32911d39..8dd3cdf7 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -8,7 +8,7 @@ import abc import dataclasses -from typing import Any, Dict, Tuple, Union, Iterator, TextIO, BinaryIO +from typing import Any, Dict, Tuple, Union, Iterator from dataclasses import dataclass import capa.features.address From 23deb4143636916b54e652a1df5bac5a2d549d21 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Jun 2023 10:58:50 +0200 Subject: [PATCH 18/82] Update capa/features/extractors/base_extractor.py --- capa/features/extractors/base_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 8dd3cdf7..a9a06d3b 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -270,7 +270,6 @@ class ProcessHandle: reference to a process extracted by the sandbox. Attributes: - ppid: parent process id pid: process id inner: sandbox-specific data """ From 7a94f524b49977b03ee73a83092afb32502b9739 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Jun 2023 10:58:59 +0200 Subject: [PATCH 19/82] Update capa/features/extractors/base_extractor.py --- capa/features/extractors/base_extractor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index a9a06d3b..e4d61bc2 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -305,10 +305,7 @@ class DynamicExtractor(FeatureExtractor): @abc.abstractmethod def get_processes(self) -> Iterator[ProcessHandle]: """ - Yields all the child-processes of a parent one. - - Attributes: - ph: parent process + Enumerate processes in the trace. """ raise NotImplementedError() From 4c701f4b6c89668a7458416bd9f919a741ea4cad Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Jun 2023 10:59:07 +0200 Subject: [PATCH 20/82] Update capa/features/extractors/base_extractor.py --- capa/features/extractors/base_extractor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index e4d61bc2..cc488fa3 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -322,10 +322,7 @@ class DynamicExtractor(FeatureExtractor): @abc.abstractmethod def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: """ - Yields all the threads that a process created. - - Attributes: - ph: parent process + Enumerate threads in the given process. """ raise NotImplementedError() From 18715dbe2e2d61581d37c27f60533f0de6e8e8fa Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 14 Jun 2023 09:02:18 +0100 Subject: [PATCH 21/82] fix typo bug --- capa/features/extractors/cape/thread.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index 08ade933..6389254f 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) def extract_call_features(calls: List[Dict], th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: tid = str(th.tid) for call in calls: - if call["thead_id"] != tid: + if call["thread_id"] != tid: continue yield API(call["api"]), int(call["caller"], 16) From a66c55ca14dec60e502b064de127625bbc2a7d07 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 14 Jun 2023 22:34:11 +0100 Subject: [PATCH 22/82] add the initial version of the cape extractor --- capa/features/extractors/cape/extractor.py | 5 +- capa/features/extractors/cape/file.py | 68 +++++++++++++++++++++ capa/features/extractors/cape/global_.py | 6 +- capa/features/extractors/cape/process.py | 71 ++++++++++++++++++++++ capa/features/extractors/cape/thread.py | 43 ++++++++----- 5 files changed, 173 insertions(+), 20 deletions(-) diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index a402c3a3..1d3e37c1 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -7,14 +7,14 @@ # See the License for the specific language governing permissions and limitations under the License. import logging -from typing import Any, Dict, List, Tuple, Iterator +from typing import Dict, Tuple, Iterator import capa.features.extractors.cape.global_ import capa.features.extractors.cape.process import capa.features.extractors.cape.file import capa.features.extractors.cape.thread from capa.features.common import Feature -from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.address import Address from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle, DynamicExtractor logger = logging.getLogger(__name__) @@ -57,6 +57,7 @@ class CapeExtractor(DynamicExtractor): format_ = list(static.keys())[0] static = static[format_] static.update(report["target"]) + static.update({"strings": report["strings"]}) static.update({"format": format_}) behavior = report.pop("behavior") diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index e69de29b..00ea597f 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -0,0 +1,68 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging +from typing import Any, Dict, List, Tuple, Iterator + +from capa.features.common import Feature, String +from capa.features.file import Section, Import, Export, FunctionName +from capa.features.address import Address, AbsoluteVirtualAddress, NO_ADDRESS + + +logger = logging.getLogger(__name__) + + +def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: + """ + extract the names of imported library files, for example: USER32.dll + """ + for library in static["imports"]: + name, address = library["name"], int(library["virtual_address"], 16) + yield Import(name), address + + +def extract_export_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: + for function in static["exports"]: + name, address = function["name"], int(function["virtual_address"], 16) + yield Export(name), address + + +def extract_section_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: + for section in static["sections"]: + name, address = section["name"], int(section["virtual_address"], 16) + yield Section(name), address + + +def extract_function_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: + """ + extract the names of imported functions. + """ + for library in static["imports"]: + for function in library["imports"]: + name, address = function["name"], int(function["address"], 16) + yield FunctionName(name), AbsoluteVirtualAddress(address) + + +def extract_file_strings(static: Dict) -> Iterator[Tuple[Feature, Address]]: + for string_ in static["strings"]: + yield String(string_), NO_ADDRESS + + +def extract_features(static: Dict) -> Iterator[Tuple[Feature, Address]]: + for handler in FILE_HANDLERS: + for feature, addr in handler(static): + yield feature, addr + + +FILE_HANDLERS = ( + extract_import_names, + extract_export_names, + extract_section_names, + extract_function_names, + extract_file_strings, +) \ No newline at end of file diff --git a/capa/features/extractors/cape/global_.py b/capa/features/extractors/cape/global_.py index c4f13840..a6621f6a 100644 --- a/capa/features/extractors/cape/global_.py +++ b/capa/features/extractors/cape/global_.py @@ -66,7 +66,7 @@ def extract_format(static) -> Iterator[Tuple[Feature, Address]]: def extract_os(static) -> Iterator[Tuple[Feature, Address]]: - # CAPE includes the output of the file command in the + # this variable contains the output of the file command file_command = static["target"]["type"] if "WINDOWS" in file_command: @@ -82,8 +82,8 @@ def extract_os(static) -> Iterator[Tuple[Feature, Address]]: def extract_features(static) -> Iterator[Tuple[Feature, Address]]: for global_handler in GLOBAL_HANDLER: - for feature, va in global_handler(static): - yield feature, va + for feature, addr in global_handler(static): + yield feature, addr GLOBAL_HANDLER = ( diff --git a/capa/features/extractors/cape/process.py b/capa/features/extractors/cape/process.py index e69de29b..8f91521b 100644 --- a/capa/features/extractors/cape/process.py +++ b/capa/features/extractors/cape/process.py @@ -0,0 +1,71 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging +from typing import Any, Dict, List, Tuple, Iterator + +import capa.features.extractors.cape.global_ +import capa.features.extractors.cape.process +import capa.features.extractors.cape.file +import capa.features.extractors.cape.thread +from capa.features.common import Feature, String +from capa.features.address import Address, AbsoluteVirtualAddress, NO_ADDRESS +from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle, DynamicExtractor + +logger = logging.getLogger(__name__) + + +def get_processes(behavior: Dict) -> Iterator[ProcessHandle]: + """ + get all created processes for a sample + """ + for process in behavior["processes"]: + inner: Dict[str, str] = {"name": process["name"], "ppid": process["parent_id"]} + yield ProcessHandle(pid=process["process_id"], inner=inner) + + +def get_threads(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: + """ + get a thread's child processes + """ + + threads: List = None + for process in behavior["processes"]: + if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: + threads = process["threads"] + + for thread in threads: + yield ThreadHandle(int(thread)) + + +def extract_environ_strings(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: + """ + extract strings from a process' provided environment variables. + """ + environ: Dict[str, str] = None + for process in behavior["processes"]: + if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: + environ = process["environ"] + + if not environ: + return + + for (variable, value) in environ.items(): + if value: + yield String(value), NO_ADDRESS + + +def extract_features(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: + for handler in PROCESS_HANDLERS: + for feature, addr in handler(behavior, ph): + yield feature, addr + + +PROCESS_HANDLERS = ( + extract_environ_strings +) \ No newline at end of file diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index 6389254f..def3ccf0 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -9,44 +9,57 @@ import logging from typing import Any, Dict, List, Tuple, Iterator -import capa.features.extractors.cape.global_ -import capa.features.extractors.cape.process -import capa.features.extractors.cape.file -import capa.features.extractors.cape.thread from capa.features.common import Feature, String from capa.features.insn import API, Number -from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle, DynamicExtractor +from capa.features.address import Address +from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle logger = logging.getLogger(__name__) -def extract_call_features(calls: List[Dict], th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: +def extract_call_features(behavior: Dict, ph:ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: + """ + this method goes through the specified thread's call trace, and extracts all possible + features such as: API, Number (for arguments), String (for arguments). + + args: + behavior: a dictionary of behavioral artifacts extracted by the sandbox + ph: process handle (for defining the extraction scope) + th: thread handle (for defining the extraction scope) + + yields: + Feature, address; where Feature is either: API, Number, or String. + """ + + calls:List[Dict] = None + for process in behavior["processes"]: + if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: + calls:List[Dict] = process + tid = str(th.tid) for call in calls: if call["thread_id"] != tid: continue - - yield API(call["api"]), int(call["caller"], 16) yield Number(int(call["return"], 16)), int(call["caller"], 16) + yield API(call["api"]), int(call["caller"], 16) for arg in call["arguments"]: if arg["value"].isdecimal(): yield Number(int(arg["value"])), int(call["caller"], 16) continue try: + # argument could be in hexadecimal yield Number(int(arg["value"], 16)), int(call["caller"], 16) except: - yield String{arg["value"]}, int(call["caller"], 16) + if arg["value"]: + # argument is a non-empty string + yield String(arg["value"]), int(call["caller"], 16) def extract_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: - processes: List = behavior["processes"] - search_result = list(map(lambda proc: proc["process_id"] == ph.pid and proc["parent_id"] == ph.ppid, processes)) - process = processes[search_result.index(True)] - for handler in THREAD_HANDLERS: - handler(process["calls"]) + for feature, addr in handler(behavior, ph, th): + yield feature, addr THREAD_HANDLERS = ( From 0cd481b1497c2f80194d94a3f149d02b5c00ceed Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 14 Jun 2023 22:42:25 +0100 Subject: [PATCH 23/82] remove redundant comments Co-authored-by: Moritz --- capa/features/common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 8318dee5..1362c538 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -278,7 +278,6 @@ class Registry(String): super().__init__(value, description) def __eq__(self, other): - # Registry instance is in a ruleset if isinstance(other, Registry): return super().__eq__(other) return False @@ -290,7 +289,6 @@ class Filename(String): super().__init__(value, description) def __eq__(self, other): - # Mutex instance is in a ruleset if isinstance(other, Filename): return super().__eq__(other) return False @@ -302,7 +300,6 @@ class Mutex(String): super().__init__(value, description) def __eq__(self, other): - # Mutex instance is in a ruleset if isinstance(other, Mutex): return super().__eq__(other) return False From 58d42b09d96c983add9f1df51e5de0fd507f5f79 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 14 Jun 2023 09:05:53 +0100 Subject: [PATCH 24/82] add ppid documentation to the dynamic extractor interface --- capa/features/extractors/base_extractor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 5724e628..b0b8126c 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -270,6 +270,7 @@ class ProcessHandle: reference to a process extracted by the sandbox. Attributes: + ppid: parent process id pid: process id inner: sandbox-specific data """ From a8f928200be545aeec6fa00d297adcddb1e81210 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 14 Jun 2023 09:33:07 +0100 Subject: [PATCH 25/82] remove ppid member from ProcessHandle Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index b0b8126c..9c672706 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -275,7 +275,6 @@ class ProcessHandle: inner: sandbox-specific data """ - ppid: int pid: int inner: Any From 64c4f0f1aa221bcd18664a347270401e196b04be Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 14 Jun 2023 09:33:24 +0100 Subject: [PATCH 26/82] remove default implementation Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 9c672706..32911d39 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -303,10 +303,6 @@ class DynamicExtractor(FeatureExtractor): This class is not instantiated directly; it is the base class for other implementations. """ - - def __init__(self): - super().__init__() - @abc.abstractmethod def get_processes(self) -> Iterator[ProcessHandle]: """ From dcce4db6d53049f17912d1c1210bdde231c790fc Mon Sep 17 00:00:00 2001 From: Capa Bot Date: Mon, 12 Jun 2023 06:58:29 +0000 Subject: [PATCH 27/82] Sync capa rules submodule --- CHANGELOG.md | 3 ++- README.md | 2 +- rules | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a736a60..c553d088 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ ### Breaking Changes - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat -### New Rules (8) +### New Rules (9) - load-code/shellcode/execute-shellcode-via-windows-callback-function ervin.ocampo@mandiant.com jakub.jozwiak@mandiant.com - nursery/execute-shellcode-via-indirect-call ronnie.salomonsen@mandiant.com @@ -19,6 +19,7 @@ - nursery/hash-data-using-sha512managed-in-dotnet jonathanlepore@google.com - nursery/compiled-with-exescript jonathanlepore@google.com - nursery/check-for-sandbox-via-mac-address-ouis-in-dotnet jonathanlepore@google.com +- host-interaction/hardware/enumerate-devices-by-category @mr-tz - ### Bug Fixes diff --git a/README.md b/README.md index 809a5651..8bfa9207 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/flare-capa)](https://pypi.org/project/flare-capa) [![Last release](https://img.shields.io/github/v/release/mandiant/capa)](https://github.com/mandiant/capa/releases) -[![Number of rules](https://img.shields.io/badge/rules-800-blue.svg)](https://github.com/mandiant/capa-rules) +[![Number of rules](https://img.shields.io/badge/rules-801-blue.svg)](https://github.com/mandiant/capa-rules) [![CI status](https://github.com/mandiant/capa/workflows/CI/badge.svg)](https://github.com/mandiant/capa/actions?query=workflow%3ACI+event%3Apush+branch%3Amaster) [![Downloads](https://img.shields.io/github/downloads/mandiant/capa/total)](https://github.com/mandiant/capa/releases) [![License](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE.txt) diff --git a/rules b/rules index 5f433fdf..baab4e37 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit 5f433fdf8ea03b592db035b6b0c934bf04bb0812 +Subproject commit baab4e37d3bf7749980663b41a36c89cb9fdadcc From a7aa817dceaea91c9b95fc9b46729a138851c679 Mon Sep 17 00:00:00 2001 From: Xusheng Date: Fri, 9 Jun 2023 11:34:03 +0800 Subject: [PATCH 28/82] Update the stack string detection with BN's builtin outlining of constant expressions --- CHANGELOG.md | 1 + capa/features/extractors/binja/basicblock.py | 72 +++++++++++++++++++- 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c553d088..d5a4b6c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ - ### Bug Fixes +- extractor: update Binary Ninja stack string detection after the new constant outlining feature #1473 @xusheng6 - extractor: update vivisect Arch extraction #1334 @mr-tz - extractor: avoid Binary Ninja exception when analyzing certain files #1441 @xusheng6 - symtab: fix struct.unpack() format for 64-bit ELF files @yelhamer diff --git a/capa/features/extractors/binja/basicblock.py b/capa/features/extractors/binja/basicblock.py index ff464b1d..e354669d 100644 --- a/capa/features/extractors/binja/basicblock.py +++ b/capa/features/extractors/binja/basicblock.py @@ -11,10 +11,13 @@ import string import struct from typing import Tuple, Iterator -from binaryninja import Function +from binaryninja import Function, Settings from binaryninja import BasicBlock as BinjaBasicBlock from binaryninja import ( BinaryView, + DataBuffer, + SymbolType, + RegisterValueType, VariableSourceType, MediumLevelILSetVar, MediumLevelILOperation, @@ -28,6 +31,66 @@ from capa.features.basicblock import BasicBlock from capa.features.extractors.helpers import MIN_STACKSTRING_LEN from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +use_const_outline: bool = False +settings: Settings = Settings() +if settings.contains("analysis.outlining.builtins") and settings.get_bool("analysis.outlining.builtins"): + use_const_outline = True + + +def get_printable_len_ascii(s: bytes) -> int: + """Return string length if all operand bytes are ascii or utf16-le printable""" + count = 0 + for c in s: + if c == 0: + return count + if c < 127 and chr(c) in string.printable: + count += 1 + return count + + +def get_printable_len_wide(s: bytes) -> int: + """Return string length if all operand bytes are ascii or utf16-le printable""" + if all(c == 0x00 for c in s[1::2]): + return get_printable_len_ascii(s[::2]) + return 0 + + +def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int: + bv: BinaryView = f.view + + if il.operation != MediumLevelILOperation.MLIL_CALL: + return 0 + + target = il.dest + if target.operation not in [MediumLevelILOperation.MLIL_CONST, MediumLevelILOperation.MLIL_CONST_PTR]: + return 0 + + addr = target.value.value + sym = bv.get_symbol_at(addr) + if not sym or sym.type != SymbolType.LibraryFunctionSymbol: + return 0 + + if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]: + return 0 + + if len(il.params) < 2: + return 0 + + dest = il.params[0] + if dest.operation != MediumLevelILOperation.MLIL_ADDRESS_OF: + return 0 + + var = dest.src + if var.source_type != VariableSourceType.StackVariableSourceType: + return 0 + + src = il.params[1] + if src.value.type != RegisterValueType.ConstantDataAggregateValue: + return 0 + + s = f.get_constant_data(RegisterValueType.ConstantDataAggregateValue, src.value.value) + return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s))) + def get_printable_len(il: MediumLevelILSetVar) -> int: """Return string length if all operand bytes are ascii or utf16-le printable""" @@ -82,8 +145,11 @@ def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool: """ count = 0 for il in bb: - if is_mov_imm_to_stack(il): - count += get_printable_len(il) + if use_const_outline: + count += get_stack_string_len(f, il) + else: + if is_mov_imm_to_stack(il): + count += get_printable_len(il) if count > MIN_STACKSTRING_LEN: return True From e671e1c87c41437b6218a21c7e74b52f0a775b65 Mon Sep 17 00:00:00 2001 From: Xusheng Date: Fri, 9 Jun 2023 13:41:31 +0800 Subject: [PATCH 29/82] Add a test that asserts on the binja version --- CHANGELOG.md | 1 + tests/test_binja_features.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5a4b6c4..69023a2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ - ### Bug Fixes +- extractor: add a Binary Ninja test that asserts its version #1487 @xusheng6 - extractor: update Binary Ninja stack string detection after the new constant outlining feature #1473 @xusheng6 - extractor: update vivisect Arch extraction #1334 @mr-tz - extractor: avoid Binary Ninja exception when analyzing certain files #1441 @xusheng6 diff --git a/tests/test_binja_features.py b/tests/test_binja_features.py index 06e91ff1..04c8a49e 100644 --- a/tests/test_binja_features.py +++ b/tests/test_binja_features.py @@ -55,3 +55,9 @@ def test_standalone_binja_backend(): CD = os.path.dirname(__file__) test_path = os.path.join(CD, "..", "tests", "data", "Practical Malware Analysis Lab 01-01.exe_") assert capa.main.main([test_path, "-b", capa.main.BACKEND_BINJA]) == 0 + + +@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") +def test_binja_version(): + version = binaryninja.core_version_info() + assert version.major == 3 and version.minor == 4 From f55804ef069c622e19db29c1db8dfbcf2b5a8ad7 Mon Sep 17 00:00:00 2001 From: Capa Bot Date: Mon, 12 Jun 2023 12:18:23 +0000 Subject: [PATCH 30/82] Sync capa rules submodule --- rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules b/rules index baab4e37..1ecaa98d 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit baab4e37d3bf7749980663b41a36c89cb9fdadcc +Subproject commit 1ecaa98de4a2040d10b519c6b9a8a8228d417655 From 51faaae1d0252d0ec10f227338ba016cea550d9b Mon Sep 17 00:00:00 2001 From: Capa Bot Date: Mon, 12 Jun 2023 12:28:18 +0000 Subject: [PATCH 31/82] Sync capa rules submodule --- README.md | 2 +- rules | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8bfa9207..809a5651 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/flare-capa)](https://pypi.org/project/flare-capa) [![Last release](https://img.shields.io/github/v/release/mandiant/capa)](https://github.com/mandiant/capa/releases) -[![Number of rules](https://img.shields.io/badge/rules-801-blue.svg)](https://github.com/mandiant/capa-rules) +[![Number of rules](https://img.shields.io/badge/rules-800-blue.svg)](https://github.com/mandiant/capa-rules) [![CI status](https://github.com/mandiant/capa/workflows/CI/badge.svg)](https://github.com/mandiant/capa/actions?query=workflow%3ACI+event%3Apush+branch%3Amaster) [![Downloads](https://img.shields.io/github/downloads/mandiant/capa/total)](https://github.com/mandiant/capa/releases) [![License](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE.txt) diff --git a/rules b/rules index 1ecaa98d..368a27e7 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit 1ecaa98de4a2040d10b519c6b9a8a8228d417655 +Subproject commit 368a27e739cdedfa37588ff8176a809159aa562b From 6e3b1bc2409248cf58f1786693b57622bd4778b0 Mon Sep 17 00:00:00 2001 From: Stephen Eckels Date: Tue, 13 Jun 2023 14:00:06 -0400 Subject: [PATCH 32/82] explorer: optimize cache and extractor interface (#1470) * Optimize cache and extractor interface * Update changelog * Run linter formatters * Implement review feedback * Move rulegen extractor construction to tab change * Change rulegen cache construction behavior * Adjust return values for CR, format * Fix mypy errors * Format * Fix merge --------- Co-authored-by: Stephen Eckels --- CHANGELOG.md | 2 ++ capa/ida/plugin/cache.py | 69 ++++++++++++++++++++++------------------ capa/ida/plugin/form.py | 66 ++++++++++++++------------------------ 3 files changed, 63 insertions(+), 74 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69023a2c..8846b14f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -88,12 +88,14 @@ Thanks for all the support, especially to @xusheng6, @captainGeech42, @ggold7046 - nursery/contain-a-thread-local-storage-tls-section-in-dotnet michael.hunhoff@mandiant.com ### Bug Fixes +- extractor: interface of cache modified to prevent extracting file and global features multiple times @stevemk14ebr - extractor: removed '.dynsym' as the library name for ELF imports #1318 @stevemk14ebr - extractor: fix vivisect loop detection corner case #1310 @mr-tz - match: extend OS characteristic to match OS_ANY to all supported OSes #1324 @mike-hunhoff - extractor: fix IDA and vivisect string and bytes features overlap and tests #1327 #1336 @xusheng6 ### capa explorer IDA Pro plugin +- rule generator plugin now loads faster when jumping between functions @stevemk14ebr - fix exception when plugin loaded in IDA hosted under idat #1341 @mike-hunhoff - improve embedded PE detection performance and reduce FP potential #1344 @mike-hunhoff diff --git a/capa/ida/plugin/cache.py b/capa/ida/plugin/cache.py index fd34824e..5226df9f 100644 --- a/capa/ida/plugin/cache.py +++ b/capa/ida/plugin/cache.py @@ -48,7 +48,8 @@ class CapaRuleGenFeatureCacheNode: class CapaRuleGenFeatureCache: - def __init__(self, fh_list: List[FunctionHandle], extractor: CapaExplorerFeatureExtractor): + def __init__(self, extractor: CapaExplorerFeatureExtractor): + self.extractor = extractor self.global_features: FeatureSet = collections.defaultdict(set) self.file_node: CapaRuleGenFeatureCacheNode = CapaRuleGenFeatureCacheNode(None, None) @@ -56,12 +57,11 @@ class CapaRuleGenFeatureCache: self.bb_nodes: Dict[Address, CapaRuleGenFeatureCacheNode] = {} self.insn_nodes: Dict[Address, CapaRuleGenFeatureCacheNode] = {} - self._find_global_features(extractor) - self._find_file_features(extractor) - self._find_function_and_below_features(fh_list, extractor) + self._find_global_features() + self._find_file_features() - def _find_global_features(self, extractor: CapaExplorerFeatureExtractor): - for feature, addr in extractor.extract_global_features(): + def _find_global_features(self): + for feature, addr in self.extractor.extract_global_features(): # not all global features may have virtual addresses. # if not, then at least ensure the feature shows up in the index. # the set of addresses will still be empty. @@ -71,46 +71,45 @@ class CapaRuleGenFeatureCache: if feature not in self.global_features: self.global_features[feature] = set() - def _find_file_features(self, extractor: CapaExplorerFeatureExtractor): + def _find_file_features(self): # not all file features may have virtual addresses. # if not, then at least ensure the feature shows up in the index. # the set of addresses will still be empty. - for feature, addr in extractor.extract_file_features(): + for feature, addr in self.extractor.extract_file_features(): if addr is not None: self.file_node.features[feature].add(addr) else: if feature not in self.file_node.features: self.file_node.features[feature] = set() - def _find_function_and_below_features(self, fh_list: List[FunctionHandle], extractor: CapaExplorerFeatureExtractor): - for fh in fh_list: - f_node: CapaRuleGenFeatureCacheNode = CapaRuleGenFeatureCacheNode(fh, self.file_node) + def _find_function_and_below_features(self, fh: FunctionHandle): + f_node: CapaRuleGenFeatureCacheNode = CapaRuleGenFeatureCacheNode(fh, self.file_node) - # extract basic block and below features - for bbh in extractor.get_basic_blocks(fh): - bb_node: CapaRuleGenFeatureCacheNode = CapaRuleGenFeatureCacheNode(bbh, f_node) + # extract basic block and below features + for bbh in self.extractor.get_basic_blocks(fh): + bb_node: CapaRuleGenFeatureCacheNode = CapaRuleGenFeatureCacheNode(bbh, f_node) - # extract instruction features - for ih in extractor.get_instructions(fh, bbh): - inode: CapaRuleGenFeatureCacheNode = CapaRuleGenFeatureCacheNode(ih, bb_node) + # extract instruction features + for ih in self.extractor.get_instructions(fh, bbh): + inode: CapaRuleGenFeatureCacheNode = CapaRuleGenFeatureCacheNode(ih, bb_node) - for feature, addr in extractor.extract_insn_features(fh, bbh, ih): - inode.features[feature].add(addr) + for feature, addr in self.extractor.extract_insn_features(fh, bbh, ih): + inode.features[feature].add(addr) - self.insn_nodes[inode.address] = inode + self.insn_nodes[inode.address] = inode - # extract basic block features - for feature, addr in extractor.extract_basic_block_features(fh, bbh): - bb_node.features[feature].add(addr) + # extract basic block features + for feature, addr in self.extractor.extract_basic_block_features(fh, bbh): + bb_node.features[feature].add(addr) - # store basic block features in cache and function parent - self.bb_nodes[bb_node.address] = bb_node + # store basic block features in cache and function parent + self.bb_nodes[bb_node.address] = bb_node - # extract function features - for feature, addr in extractor.extract_function_features(fh): - f_node.features[feature].add(addr) + # extract function features + for feature, addr in self.extractor.extract_function_features(fh): + f_node.features[feature].add(addr) - self.func_nodes[f_node.address] = f_node + self.func_nodes[f_node.address] = f_node def _find_instruction_capabilities( self, ruleset: RuleSet, insn: CapaRuleGenFeatureCacheNode @@ -155,7 +154,7 @@ class CapaRuleGenFeatureCache: def find_code_capabilities( self, ruleset: RuleSet, fh: FunctionHandle ) -> Tuple[FeatureSet, MatchResults, MatchResults, MatchResults]: - f_node: Optional[CapaRuleGenFeatureCacheNode] = self.func_nodes.get(fh.address, None) + f_node: Optional[CapaRuleGenFeatureCacheNode] = self._get_cached_func_node(fh) if f_node is None: return {}, {}, {}, {} @@ -195,8 +194,16 @@ class CapaRuleGenFeatureCache: _, matches = ruleset.match(Scope.FILE, features, NO_ADDRESS) return features, matches - def get_all_function_features(self, fh: FunctionHandle) -> FeatureSet: + def _get_cached_func_node(self, fh: FunctionHandle) -> Optional[CapaRuleGenFeatureCacheNode]: f_node: Optional[CapaRuleGenFeatureCacheNode] = self.func_nodes.get(fh.address, None) + if f_node is None: + # function is not in our cache, do extraction now + self._find_function_and_below_features(fh) + f_node = self.func_nodes.get(fh.address, None) + return f_node + + def get_all_function_features(self, fh: FunctionHandle) -> FeatureSet: + f_node: Optional[CapaRuleGenFeatureCacheNode] = self._get_cached_func_node(fh) if f_node is None: return {} diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index 72b33a66..07fbe69f 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -192,8 +192,10 @@ class CapaExplorerForm(idaapi.PluginForm): # caches used to speed up capa explorer analysis - these must be init to None self.resdoc_cache: Optional[capa.render.result_document.ResultDocument] = None self.program_analysis_ruleset_cache: Optional[capa.rules.RuleSet] = None - self.rulegen_ruleset_cache: Optional[capa.rules.RuleSet] = None + self.feature_extractor: Optional[CapaExplorerFeatureExtractor] = None + self.rulegen_feature_extractor: Optional[CapaExplorerFeatureExtractor] = None self.rulegen_feature_cache: Optional[CapaRuleGenFeatureCache] = None + self.rulegen_ruleset_cache: Optional[capa.rules.RuleSet] = None self.rulegen_current_function: Optional[FunctionHandle] = None # models @@ -727,13 +729,11 @@ class CapaExplorerForm(idaapi.PluginForm): update_wait_box(f"{text} ({self.process_count} of {self.process_total})") self.process_count += 1 - update_wait_box("initializing feature extractor") - try: - extractor = CapaExplorerFeatureExtractor() - extractor.indicator.progress.connect(slot_progress_feature_extraction) + self.feature_extractor = CapaExplorerFeatureExtractor() + self.feature_extractor.indicator.progress.connect(slot_progress_feature_extraction) except Exception as e: - logger.error("Failed to initialize feature extractor (error: %s).", e, exc_info=True) + logger.error("Failed to initialize feature extractor (error: %s)", e, exc_info=True) return False if ida_kernwin.user_cancelled(): @@ -743,7 +743,7 @@ class CapaExplorerForm(idaapi.PluginForm): update_wait_box("calculating analysis") try: - self.process_total += len(tuple(extractor.get_functions())) + self.process_total += len(tuple(self.feature_extractor.get_functions())) except Exception as e: logger.error("Failed to calculate analysis (error: %s).", e, exc_info=True) return False @@ -770,12 +770,13 @@ class CapaExplorerForm(idaapi.PluginForm): try: meta = capa.ida.helpers.collect_metadata([settings.user[CAPA_SETTINGS_RULE_PATH]]) - capabilities, counts = capa.main.find_capabilities(ruleset, extractor, disable_progress=True) + capabilities, counts = capa.main.find_capabilities( + ruleset, self.feature_extractor, disable_progress=True + ) meta.analysis.feature_counts = counts["feature_counts"] meta.analysis.library_functions = counts["library_functions"] - meta.analysis.layout = capa.main.compute_layout(ruleset, extractor, capabilities) - + meta.analysis.layout = capa.main.compute_layout(ruleset, self.feature_extractor, capabilities) except UserCancelledError: logger.info("User cancelled analysis.") return False @@ -978,26 +979,21 @@ class CapaExplorerForm(idaapi.PluginForm): # so we'll work with a local copy of the ruleset. ruleset = copy.deepcopy(self.rulegen_ruleset_cache) - # clear feature cache - if self.rulegen_feature_cache is not None: - self.rulegen_feature_cache = None - # clear cached function if self.rulegen_current_function is not None: self.rulegen_current_function = None - if ida_kernwin.user_cancelled(): - logger.info("User cancelled analysis.") - return False - - update_wait_box("Initializing feature extractor") - - try: - # must use extractor to get function, as capa analysis requires casted object - extractor = CapaExplorerFeatureExtractor() - except Exception as e: - logger.error("Failed to initialize feature extractor (error: %s)", e, exc_info=True) - return False + # these are init once objects, create on tab change + if self.rulegen_feature_cache is None or self.rulegen_feature_extractor is None: + try: + update_wait_box("performing one-time file analysis") + self.rulegen_feature_extractor = CapaExplorerFeatureExtractor() + self.rulegen_feature_cache = CapaRuleGenFeatureCache(self.rulegen_feature_extractor) + except Exception as e: + logger.error("Failed to initialize feature extractor (error: %s)", e, exc_info=True) + return False + else: + logger.info("Reusing prior rulegen cache") if ida_kernwin.user_cancelled(): logger.info("User cancelled analysis.") @@ -1009,7 +1005,7 @@ class CapaExplorerForm(idaapi.PluginForm): try: f = idaapi.get_func(idaapi.get_screen_ea()) if f is not None: - self.rulegen_current_function = extractor.get_function(f.start_ea) + self.rulegen_current_function = self.rulegen_feature_extractor.get_function(f.start_ea) except Exception as e: logger.error("Failed to resolve function at address 0x%X (error: %s)", f.start_ea, e, exc_info=True) return False @@ -1018,21 +1014,6 @@ class CapaExplorerForm(idaapi.PluginForm): logger.info("User cancelled analysis.") return False - # extract features - try: - fh_list: List[FunctionHandle] = [] - if self.rulegen_current_function is not None: - fh_list.append(self.rulegen_current_function) - - self.rulegen_feature_cache = CapaRuleGenFeatureCache(fh_list, extractor) - except Exception as e: - logger.error("Failed to extract features (error: %s)", e, exc_info=True) - return False - - if ida_kernwin.user_cancelled(): - logger.info("User cancelled analysis.") - return False - update_wait_box("generating function rule matches") all_function_features: FeatureSet = collections.defaultdict(set) @@ -1264,7 +1245,6 @@ class CapaExplorerForm(idaapi.PluginForm): elif index == 1: self.set_view_status_label(self.view_status_label_rulegen_cache) self.view_status_label_analysis_cache = status_prev - self.view_reset_button.setText("Clear") def slot_rulegen_editor_update(self): From 2a047073e93bea4e82c314e0b5d0b3c8b024ed03 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 14 Jun 2023 09:58:33 +0100 Subject: [PATCH 33/82] remove redundant types Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 32911d39..8dd3cdf7 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -8,7 +8,7 @@ import abc import dataclasses -from typing import Any, Dict, Tuple, Union, Iterator, TextIO, BinaryIO +from typing import Any, Dict, Tuple, Union, Iterator from dataclasses import dataclass import capa.features.address From dc371580a53dc0e4cf446c0a1d2cd4af20238a65 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Jun 2023 10:58:50 +0200 Subject: [PATCH 34/82] Update capa/features/extractors/base_extractor.py --- capa/features/extractors/base_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 8dd3cdf7..a9a06d3b 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -270,7 +270,6 @@ class ProcessHandle: reference to a process extracted by the sandbox. Attributes: - ppid: parent process id pid: process id inner: sandbox-specific data """ From 6c58e26f14d6d7d06d7fc83a7cc559d32048733b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Jun 2023 10:58:59 +0200 Subject: [PATCH 35/82] Update capa/features/extractors/base_extractor.py --- capa/features/extractors/base_extractor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index a9a06d3b..e4d61bc2 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -305,10 +305,7 @@ class DynamicExtractor(FeatureExtractor): @abc.abstractmethod def get_processes(self) -> Iterator[ProcessHandle]: """ - Yields all the child-processes of a parent one. - - Attributes: - ph: parent process + Enumerate processes in the trace. """ raise NotImplementedError() From e7115c7316d70f5b1c810c6cac57daf06a80f698 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Jun 2023 10:59:07 +0200 Subject: [PATCH 36/82] Update capa/features/extractors/base_extractor.py --- capa/features/extractors/base_extractor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index e4d61bc2..cc488fa3 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -322,10 +322,7 @@ class DynamicExtractor(FeatureExtractor): @abc.abstractmethod def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: """ - Yields all the threads that a process created. - - Attributes: - ph: parent process + Enumerate threads in the given process. """ raise NotImplementedError() From d9d9d98ea0e98f40e88dcf08e4f994a982b61ae2 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 14 Jun 2023 22:45:12 +0100 Subject: [PATCH 37/82] update the Registry, Filename, and Mutex classes --- capa/features/common.py | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 8318dee5..4084994d 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -273,40 +273,15 @@ class _MatchedSubstring(Substring): class Registry(String): - # todo: add a way to tell whether this registry key was created, accessed, or deleted. - def __init__(self, value: str, description=None): - super().__init__(value, description) - - def __eq__(self, other): - # Registry instance is in a ruleset - if isinstance(other, Registry): - return super().__eq__(other) - return False + pass class Filename(String): - # todo: add a way to tell whether this file was created, accessed, or deleted. - def __init__(self, value: str, description=None): - super().__init__(value, description) - - def __eq__(self, other): - # Mutex instance is in a ruleset - if isinstance(other, Filename): - return super().__eq__(other) - return False + pass class Mutex(String): - # todo: add a way to tell whether this mutex was created or used - def __init__(self, value: str, description=None): - super().__init__(value, description) - - def __eq__(self, other): - # Mutex instance is in a ruleset - if isinstance(other, Mutex): - return super().__eq__(other) - return False - + pass class Regex(String): def __init__(self, value: str, description=None): From 91f1d4132419ced2d819118564e32606a449c294 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 14 Jun 2023 22:57:41 +0100 Subject: [PATCH 38/82] extract registry keys, files, and mutexes from the sample --- capa/features/extractors/cape/extractor.py | 2 +- capa/features/extractors/cape/file.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index 1d3e37c1..a37b9d4c 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -57,11 +57,11 @@ class CapeExtractor(DynamicExtractor): format_ = list(static.keys())[0] static = static[format_] static.update(report["target"]) + static.update(report["behavior"].pop("summary")) static.update({"strings": report["strings"]}) static.update({"format": format_}) behavior = report.pop("behavior") - behavior.update(behavior.pop("summary")) behavior["network"] = report.pop("network") return cls(static, behavior) \ No newline at end of file diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index 00ea597f..03ae992a 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -9,7 +9,7 @@ import logging from typing import Any, Dict, List, Tuple, Iterator -from capa.features.common import Feature, String +from capa.features.common import Feature, String, Registry, Filename, Mutex from capa.features.file import Section, Import, Export, FunctionName from capa.features.address import Address, AbsoluteVirtualAddress, NO_ADDRESS @@ -53,6 +53,21 @@ def extract_file_strings(static: Dict) -> Iterator[Tuple[Feature, Address]]: yield String(string_), NO_ADDRESS +def extract_used_regkeys(static: Dict) -> Iterator[Tuple[Feature, Address]]: + for regkey in static["keys"]: + yield Registry(regkey), NO_ADDRESS + + +def extract_used_files(static: Dict) -> Iterator[Tuple[Feature, Address]]: + for filename in static["files"]: + yield Filename(filename), NO_ADDRESS + + +def extract_used_mutexes(static: Dict) -> Iterator[Tuple[Feature, Address]]: + for mutex in static["mutexes"]: + yield Mutex(mutex), NO_ADDRESS + + def extract_features(static: Dict) -> Iterator[Tuple[Feature, Address]]: for handler in FILE_HANDLERS: for feature, addr in handler(static): @@ -65,4 +80,7 @@ FILE_HANDLERS = ( extract_section_names, extract_function_names, extract_file_strings, + extract_used_regkeys, + extract_used_files, + extract_used_mutexes, ) \ No newline at end of file From 0cf728b7e1f8467b32f97557a8975ba9000463d3 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Thu, 15 Jun 2023 12:28:08 +0100 Subject: [PATCH 39/82] global_.py: update typo in yielded OS name Co-authored-by: Willi Ballenthin --- capa/features/extractors/cape/global_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/cape/global_.py b/capa/features/extractors/cape/global_.py index a6621f6a..bc9f2f49 100644 --- a/capa/features/extractors/cape/global_.py +++ b/capa/features/extractors/cape/global_.py @@ -36,7 +36,7 @@ def guess_elf_os(file_output) -> Iterator[Tuple[Feature, Address]]: return OS(OS_LINUX), NO_ADDRESS elif "Hurd" in file_output: return OS("hurd"), NO_ADDRESS - elif "Solairs" in file_output: + elif "Solaris" in file_output: return OS("solaris"), NO_ADDRESS elif "kFreeBSD" in file_output: return OS("freebsd"), NO_ADDRESS From 865616284f8ce15ef99ed8c108649d1722a1ba34 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Thu, 15 Jun 2023 12:33:22 +0100 Subject: [PATCH 40/82] cape/thread.py: remove yielding argument features Co-authored-by: Willi Ballenthin --- capa/features/extractors/cape/thread.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index def3ccf0..c5b7c025 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -43,17 +43,6 @@ def extract_call_features(behavior: Dict, ph:ProcessHandle, th: ThreadHandle) -> continue yield Number(int(call["return"], 16)), int(call["caller"], 16) yield API(call["api"]), int(call["caller"], 16) - for arg in call["arguments"]: - if arg["value"].isdecimal(): - yield Number(int(arg["value"])), int(call["caller"], 16) - continue - try: - # argument could be in hexadecimal - yield Number(int(arg["value"], 16)), int(call["caller"], 16) - except: - if arg["value"]: - # argument is a non-empty string - yield String(arg["value"]), int(call["caller"], 16) def extract_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: From 7e51e030434bbcb460f387bde2259f7089f13f16 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Jun 2023 12:43:39 +0100 Subject: [PATCH 41/82] cape/file.py: remove String, Filename, and Mutex features --- capa/features/extractors/cape/file.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index 03ae992a..f046c035 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -9,7 +9,7 @@ import logging from typing import Any, Dict, List, Tuple, Iterator -from capa.features.common import Feature, String, Registry, Filename, Mutex +from capa.features.common import Feature, String from capa.features.file import Section, Import, Export, FunctionName from capa.features.address import Address, AbsoluteVirtualAddress, NO_ADDRESS @@ -55,17 +55,17 @@ def extract_file_strings(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_used_regkeys(static: Dict) -> Iterator[Tuple[Feature, Address]]: for regkey in static["keys"]: - yield Registry(regkey), NO_ADDRESS + yield String(regkey), NO_ADDRESS def extract_used_files(static: Dict) -> Iterator[Tuple[Feature, Address]]: for filename in static["files"]: - yield Filename(filename), NO_ADDRESS + yield String(filename), NO_ADDRESS def extract_used_mutexes(static: Dict) -> Iterator[Tuple[Feature, Address]]: for mutex in static["mutexes"]: - yield Mutex(mutex), NO_ADDRESS + yield String(mutex), NO_ADDRESS def extract_features(static: Dict) -> Iterator[Tuple[Feature, Address]]: From 22640eb9008896c87805c219250049dd79f2e594 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Jun 2023 12:44:57 +0100 Subject: [PATCH 42/82] cape/file.py: remove FunctionName feature extraction for imported functions --- capa/features/extractors/cape/file.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index f046c035..c67a52a9 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -38,16 +38,6 @@ def extract_section_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: yield Section(name), address -def extract_function_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: - """ - extract the names of imported functions. - """ - for library in static["imports"]: - for function in library["imports"]: - name, address = function["name"], int(function["address"], 16) - yield FunctionName(name), AbsoluteVirtualAddress(address) - - def extract_file_strings(static: Dict) -> Iterator[Tuple[Feature, Address]]: for string_ in static["strings"]: yield String(string_), NO_ADDRESS From e1535dd5741e3075c34a4446ed942db8b8813a56 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Jun 2023 13:17:07 +0100 Subject: [PATCH 43/82] remove Registry, Filename, and mutex features --- capa/features/common.py | 11 ----------- capa/features/extractors/cape/file.py | 1 - 2 files changed, 12 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 4084994d..5060ebaa 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -272,17 +272,6 @@ class _MatchedSubstring(Substring): return f'substring("{self.value}", matches = {matches})' -class Registry(String): - pass - - -class Filename(String): - pass - - -class Mutex(String): - pass - class Regex(String): def __init__(self, value: str, description=None): super().__init__(value, description=description) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index c67a52a9..3aa344a4 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -68,7 +68,6 @@ FILE_HANDLERS = ( extract_import_names, extract_export_names, extract_section_names, - extract_function_names, extract_file_strings, extract_used_regkeys, extract_used_files, From dbad921fa52d79b08e261f3de86848e7b8265dab Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Jun 2023 13:21:17 +0100 Subject: [PATCH 44/82] code style changes --- capa/features/extractors/base_extractor.py | 5 +++-- capa/features/extractors/cape/extractor.py | 12 +++++------- capa/features/extractors/cape/file.py | 9 ++++----- capa/features/extractors/cape/global_.py | 17 ++++++++--------- capa/features/extractors/cape/process.py | 18 ++++++++---------- capa/features/extractors/cape/thread.py | 15 ++++++--------- capa/features/insn.py | 12 ++++++------ 7 files changed, 40 insertions(+), 48 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index cc488fa3..3916b8b9 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -296,12 +296,13 @@ class DynamicExtractor(FeatureExtractor): """ DynamicExtractor defines the interface for fetching features from a sandbox' analysis of a sample. - Features are grouped mainly into threads that alongside their meta-features are also grouped into - processes (that also have their own features). Other scopes (such as function and file) may also apply + Features are grouped mainly into threads that alongside their meta-features are also grouped into + processes (that also have their own features). Other scopes (such as function and file) may also apply for a specific sandbox. This class is not instantiated directly; it is the base class for other implementations. """ + @abc.abstractmethod def get_processes(self) -> Iterator[ProcessHandle]: """ diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index a37b9d4c..fd5bcafd 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -9,13 +9,13 @@ import logging from typing import Dict, Tuple, Iterator -import capa.features.extractors.cape.global_ -import capa.features.extractors.cape.process import capa.features.extractors.cape.file import capa.features.extractors.cape.thread +import capa.features.extractors.cape.global_ +import capa.features.extractors.cape.process from capa.features.common import Feature from capa.features.address import Address -from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle, DynamicExtractor +from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicExtractor logger = logging.getLogger(__name__) @@ -28,13 +28,12 @@ class CapeExtractor(DynamicExtractor): self.global_features = capa.features.extractors.cape.global_.extract_features(self.static) - def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features def get_file_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.cape.file.extract_features(self.static) - + def get_processes(self) -> Iterator[ProcessHandle]: yield from capa.features.extractors.cape.process.get_processes(self.behavior) @@ -47,7 +46,6 @@ class CapeExtractor(DynamicExtractor): def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.cape.thread.extract_features(self.behavior, ph, th) - @classmethod def from_report(cls, report: Dict) -> "DynamicExtractor": # todo: @@ -64,4 +62,4 @@ class CapeExtractor(DynamicExtractor): behavior = report.pop("behavior") behavior["network"] = report.pop("network") - return cls(static, behavior) \ No newline at end of file + return cls(static, behavior) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index 3aa344a4..b6f60b3b 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -9,10 +9,9 @@ import logging from typing import Any, Dict, List, Tuple, Iterator -from capa.features.common import Feature, String -from capa.features.file import Section, Import, Export, FunctionName -from capa.features.address import Address, AbsoluteVirtualAddress, NO_ADDRESS - +from capa.features.file import Export, Import, Section, FunctionName +from capa.features.common import String, Feature +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress logger = logging.getLogger(__name__) @@ -72,4 +71,4 @@ FILE_HANDLERS = ( extract_used_regkeys, extract_used_files, extract_used_mutexes, -) \ No newline at end of file +) diff --git a/capa/features/extractors/cape/global_.py b/capa/features/extractors/cape/global_.py index bc9f2f49..6479f109 100644 --- a/capa/features/extractors/cape/global_.py +++ b/capa/features/extractors/cape/global_.py @@ -9,23 +9,22 @@ import logging from typing import Tuple, Iterator -from capa.features.address import Address, NO_ADDRESS from capa.features.common import ( OS, OS_ANY, - ARCH_I386, - ARCH_AMD64, ARCH_ANY, - FORMAT_PE, - FORMAT_ELF, - FORMAT_UNKNOWN, - OS_WINDOWS, OS_LINUX, + ARCH_I386, + FORMAT_PE, + ARCH_AMD64, + FORMAT_ELF, + OS_WINDOWS, + FORMAT_UNKNOWN, Arch, Format, Feature, ) - +from capa.features.address import NO_ADDRESS, Address logger = logging.getLogger(__name__) @@ -90,4 +89,4 @@ GLOBAL_HANDLER = ( extract_arch, extract_format, extract_os, -) \ No newline at end of file +) diff --git a/capa/features/extractors/cape/process.py b/capa/features/extractors/cape/process.py index 8f91521b..d36dae40 100644 --- a/capa/features/extractors/cape/process.py +++ b/capa/features/extractors/cape/process.py @@ -9,13 +9,13 @@ import logging from typing import Any, Dict, List, Tuple, Iterator -import capa.features.extractors.cape.global_ -import capa.features.extractors.cape.process import capa.features.extractors.cape.file import capa.features.extractors.cape.thread -from capa.features.common import Feature, String -from capa.features.address import Address, AbsoluteVirtualAddress, NO_ADDRESS -from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle, DynamicExtractor +import capa.features.extractors.cape.global_ +import capa.features.extractors.cape.process +from capa.features.common import String, Feature +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress +from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicExtractor logger = logging.getLogger(__name__) @@ -54,8 +54,8 @@ def extract_environ_strings(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple if not environ: return - - for (variable, value) in environ.items(): + + for variable, value in environ.items(): if value: yield String(value), NO_ADDRESS @@ -66,6 +66,4 @@ def extract_features(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Featur yield feature, addr -PROCESS_HANDLERS = ( - extract_environ_strings -) \ No newline at end of file +PROCESS_HANDLERS = extract_environ_strings diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index c5b7c025..9a4438d2 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -9,16 +9,15 @@ import logging from typing import Any, Dict, List, Tuple, Iterator -from capa.features.common import Feature, String from capa.features.insn import API, Number +from capa.features.common import String, Feature from capa.features.address import Address -from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle - +from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle logger = logging.getLogger(__name__) -def extract_call_features(behavior: Dict, ph:ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: +def extract_call_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: """ this method goes through the specified thread's call trace, and extracts all possible features such as: API, Number (for arguments), String (for arguments). @@ -32,10 +31,10 @@ def extract_call_features(behavior: Dict, ph:ProcessHandle, th: ThreadHandle) -> Feature, address; where Feature is either: API, Number, or String. """ - calls:List[Dict] = None + calls: List[Dict] = None for process in behavior["processes"]: if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: - calls:List[Dict] = process + calls: List[Dict] = process tid = str(th.tid) for call in calls: @@ -51,6 +50,4 @@ def extract_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Ite yield feature, addr -THREAD_HANDLERS = ( - extract_call_features, -) \ No newline at end of file +THREAD_HANDLERS = (extract_call_features,) diff --git a/capa/features/insn.py b/capa/features/insn.py index 96396f6d..1e977e5a 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import abc -from typing import Tuple, Union, Optional, Dict +from typing import Dict, Tuple, Union, Optional import capa.helpers from capa.features.common import VALID_FEATURE_ACCESS, Feature @@ -41,8 +41,8 @@ class API(Feature): def __eq__(self, other): if not isinstance(other, API): return False - - assert(isinstance(other, API)) + + assert isinstance(other, API) if {} in (self.args, other.args) or False in (self.ret, other.ret): # Legacy API feature return super().__eq__(other) @@ -64,12 +64,12 @@ class API(Feature): match = re.findall(r"(.*)\((.*)\)", match[0][0]) if len(match[0]) == 2: - args = (match[0][1]+", ").split(", ") + args = (match[0][1] + ", ").split(", ") map(lambda x: {f"arg{x[0]}": x[1]}, enumerate(args)) args = [{} | arg for arg in args][0] - + return match[0][0], args, ret - + class _AccessFeature(Feature, abc.ABC): # superclass: don't use directly From d6fa832d83f90da4c507d8e24c9d46e46e0cb3fe Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 13:50:46 +0100 Subject: [PATCH 45/82] cape: move get_processes() method to file scope --- capa/features/extractors/cape/extractor.py | 7 ++----- capa/features/extractors/cape/file.py | 14 ++++++++++++++ capa/features/extractors/cape/process.py | 3 +-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index fd5bcafd..01836fee 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -5,7 +5,6 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. - import logging from typing import Dict, Tuple, Iterator @@ -35,7 +34,7 @@ class CapeExtractor(DynamicExtractor): yield from capa.features.extractors.cape.file.extract_features(self.static) def get_processes(self) -> Iterator[ProcessHandle]: - yield from capa.features.extractors.cape.process.get_processes(self.behavior) + yield from capa.features.extractors.cape.file.get_processes(self.behavior) def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.cape.process.extract_features(self.behavior, ph) @@ -48,14 +47,12 @@ class CapeExtractor(DynamicExtractor): @classmethod def from_report(cls, report: Dict) -> "DynamicExtractor": - # todo: - # 1. make the information extraction code more elegant - # 2. filter out redundant cape features in an efficient way static = report["static"] format_ = list(static.keys())[0] static = static[format_] static.update(report["target"]) static.update(report["behavior"].pop("summary")) + static.update({"processtree": report["behavior"]["processtree"]}) static.update({"strings": report["strings"]}) static.update({"format": format_}) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index b6f60b3b..12caad2b 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -12,10 +12,24 @@ from typing import Any, Dict, List, Tuple, Iterator from capa.features.file import Export, Import, Section, FunctionName from capa.features.common import String, Feature from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress +from capa.features.extractors.base_extractor import ProcessHandle logger = logging.getLogger(__name__) +def get_processes(static: Dict) -> Iterator[ProcessHandle]: + """ + get all the created processes for a sample + """ + def rec(process): + inner: Dict[str, str] = {"name": process["name"], "ppid": process["parent_id"]} + yield ProcessHandle(pid=process["pid"], inner=inner) + for child in process["children"]: + rec(child) + + yield from rec(static["processtree"]) + + def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: """ extract the names of imported library files, for example: USER32.dll diff --git a/capa/features/extractors/cape/process.py b/capa/features/extractors/cape/process.py index d36dae40..efb11299 100644 --- a/capa/features/extractors/cape/process.py +++ b/capa/features/extractors/cape/process.py @@ -5,7 +5,6 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. - import logging from typing import Any, Dict, List, Tuple, Iterator @@ -66,4 +65,4 @@ def extract_features(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Featur yield feature, addr -PROCESS_HANDLERS = extract_environ_strings +PROCESS_HANDLERS = (extract_environ_strings,) From a04512d7b8ebe0a32f97bd340846ede0969ee048 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 16:43:54 +0100 Subject: [PATCH 46/82] add unit tests for the cape feature extractor --- tests/fixtures.py | 145 +++++++++++++++++++++++++++++++++++- tests/test_cape_features.py | 26 +++++++ 2 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 tests/test_cape_features.py diff --git a/tests/fixtures.py b/tests/fixtures.py index 84e40209..ddb30d6a 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -41,7 +41,7 @@ from capa.features.common import ( FeatureAccess, ) from capa.features.address import Address -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, ProcessHandle, ThreadHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor CD = os.path.dirname(__file__) @@ -183,6 +183,18 @@ def get_binja_extractor(path): return extractor +@lru_cache(maxsize=1) +def get_cape_extractor(path): + from capa.features.extractors.cape.extractor import CapeExtractor + import json + + with open(path) as report_file: + report = report_file.read() + report = json.loads(report) + + extractor = CapeExtractor.from_report(report) + return extractor + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): @@ -198,6 +210,23 @@ def extract_file_features(extractor): return features +def extract_process_features(extractor, ph): + features = collections.defaultdict(set) + for thread in extractor.get_threads(ph): + for feature, va in extractor.extract_thread_features(ph, thread): + features[feature].add(va) + for feature, va in extractor.extract_process_features(ph): + features[feature].add(va) + return features + + +def extract_thread_features(extractor, ph, th): + features = collections.defaultdict(set) + for feature, va in extractor.extract_thread_features(ph, th): + features[feature].add(va) + return features + + # f may not be hashable (e.g. ida func_t) so cannot @lru_cache this def extract_function_features(extractor, fh): features = collections.defaultdict(set) @@ -311,6 +340,8 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "294b8db1f2702b60fb2e42fdc50c2cee6a5046112da9a5703a548a4fa50477bc.elf_") elif name.startswith("2bf18d"): return os.path.join(CD, "data", "2bf18d0403677378adad9001b1243211.elf_") + elif name.startswith("02179f"): + return os.path.join(CD, "dynamic_02179f3ba93663074740b5c0d283bae2.json") else: raise ValueError(f"unexpected sample fixture: {name}") @@ -384,6 +415,20 @@ def sample(request): return resolve_sample(request.param) +def get_process(extractor, ppid: int, pid: int) -> ProcessHandle: + for ph in extractor.get_processes(): + if ph.inner["ppid"] == ppid and ph.pid == pid: + return ProcessHandle(pid, {"ppid": ppid}) + raise ValueError("process not found") + + +def get_thread(extractor, ph: ProcessHandle, tid: int) -> ThreadHandle: + for th in extractor.get_processes(ph): + if th.tid == tid: + return ThreadHandle(tid) + raise ValueError("process not found") + + def get_function(extractor, fva: int) -> FunctionHandle: for fh in extractor.get_functions(): if isinstance(extractor, DnfileFeatureExtractor): @@ -491,6 +536,38 @@ def resolve_scope(scope): inner_function.__name__ = scope return inner_function + elif "thread=" in scope: + assert "process=" in scope + pspec, _, tspec = scope.partition(",") + pspec = scope.partition("=")[2].split(",") + assert len(pspec) == 2 + ppid, pid = map(lambda x: int(x), pspec) + tid = int(tspec) + + def inner_thread(extractor): + ph = get_process(extractor, ppid, pid) + th = get_thread(extractor, ph, tid) + features = extract_thread_features(extractor, ph, th) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features + + inner_thread.__name__ = scope + return inner_thread + elif "process=" in scope: + pspec = scope.partition("=")[2].split(",") + assert len(pspec) == 2 + ppid, pid = map(lambda x: int(x), pspec) + + def inner_process(extractor): + ph = get_process(extractor, ppid, pid) + features = extract_process_features(extractor, ph) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features + + inner_process.__name__ = scope + return inner_process else: raise ValueError("unexpected scope fixture") @@ -516,6 +593,72 @@ def parametrize(params, values, **kwargs): return pytest.mark.parametrize(params, values, ids=ids, **kwargs) +DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( + [ + # file/string + ("", "file", capa.features.common.String(""), True), + ("", "file", capa.features.common.String(""), True), + ("", "file", capa.features.common.String(""), True), + ("", "file", capa.features.common.String("makansh menah"), False), + # file/sections + ("", "file", capa.features.file.Section(""), True), + ("", "file", capa.features.file.Section(""), False), + # file/imports + ("", "file", capa.features.file.Import(""), True), + ("", "file", capa.features.file.Import(""), False), + # file/exports + ("", "file", capa.features.file.Export(""), True), + ("", "file", capa.features.file.Export(""), False), + # process/environment variables + ("", "process=()", capa.features.common.String(""), True), + ("", "process=()", capa.features.common.String(""), False), + # thread/api calls + ("", "process=(),thread=", capa.features.insn.API(""), True), + ("", "process=(),thread=", capa.features.insn.API(""), False), + # thread/number call argument + ("", "process=(),thread=", capa.features.insn.Number(""), True), + ("", "process=(),thread=", capa.features.insn.Number(""), False), + # thread/string call argument + ("", "process=(),thread=", capa.features.common.String(""), True), + ("", "process=(),thread=", capa.features.common.String(""), False), + ], + # order tests by (file, item) + # so that our LRU cache is most effective. + key=lambda t: (t[0], t[1]), +) + +DYNAMIC_FEATURE_COUNT_PRESENCE_TESTS = sorted( + [ + # file/string + ("", "file", capa.features.common.String(""), ), + ("", "file", capa.features.common.String("makansh menah"), 0), + # file/sections + ("", "file", capa.features.file.Section(""), 1), + ("", "file", capa.features.file.Section(""), 0), + # file/imports + ("", "file", capa.features.file.Import(""), 1), + ("", "file", capa.features.file.Import(""), 0), + # file/exports + ("", "file", capa.features.file.Export(""), 1), + ("", "file", capa.features.file.Export(""), 0), + # process/environment variables + ("", "process=()", capa.features.common.String(""), 1), + ("", "process=()", capa.features.common.String(""), 0), + # thread/api calls + ("", "process=(),thread=", capa.features.insn.API(""), 1), + ("", "process=(),thread=", capa.features.insn.API(""), 0), + # thread/number call argument + ("", "process=(),thread=", capa.features.insn.Number(""), 1), + ("", "process=(),thread=", capa.features.insn.Number(""), 0), + # thread/string call argument + ("", "process=(),thread=", capa.features.common.String(""), 1), + ("", "process=(),thread=", capa.features.common.String(""), 0), + ], + # order tests by (file, item) + # so that our LRU cache is most effective. + key=lambda t: (t[0], t[1]), +) + FEATURE_PRESENCE_TESTS = sorted( [ # file/characteristic("embedded pe") diff --git a/tests/test_cape_features.py b/tests/test_cape_features.py new file mode 100644 index 00000000..5e50c9ab --- /dev/null +++ b/tests/test_cape_features.py @@ -0,0 +1,26 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import fixtures +from fixtures import * + +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.DYNAMIC_FEATURE_PRESENCE_TESTS, + indirect=["sample", "scope"], +) +def test_cape_features(sample, scope, feature, expected): + fixtures.do_test_feature_presence(fixtures.get_cape_extractor, sample, scope, feature, expected) + + +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.DYNAMIC_FEATURE_COUNT_TESTS, + indirect=["sample", "scope"], +) +def test_viv_feature_counts(sample, scope, feature, expected): + fixtures.do_test_feature_count(fixtures.get_cape_extractor, sample, scope, feature, expected) From 9458e851c07b29e007bea354558afabcd1be9532 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 16:46:24 +0100 Subject: [PATCH 47/82] update test sample's path --- tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index ddb30d6a..6aaca8f7 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -341,7 +341,7 @@ def get_data_path_by_name(name): elif name.startswith("2bf18d"): return os.path.join(CD, "data", "2bf18d0403677378adad9001b1243211.elf_") elif name.startswith("02179f"): - return os.path.join(CD, "dynamic_02179f3ba93663074740b5c0d283bae2.json") + return os.path.join(CD, "data", "dynamic_02179f3ba93663074740b5c0d283bae2.json") else: raise ValueError(f"unexpected sample fixture: {name}") From 98e7acddf486d9dea894c810fabe68d0604ec1f8 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 16:59:27 +0100 Subject: [PATCH 48/82] fix codestyle issues --- tests/fixtures.py | 22 ++++++++++++++-------- tests/test_cape_features.py | 1 + 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 6aaca8f7..ac8d53ad 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -41,7 +41,7 @@ from capa.features.common import ( FeatureAccess, ) from capa.features.address import Address -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, ProcessHandle, ThreadHandle +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, ThreadHandle, ProcessHandle, FunctionHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor CD = os.path.dirname(__file__) @@ -185,16 +185,18 @@ def get_binja_extractor(path): @lru_cache(maxsize=1) def get_cape_extractor(path): - from capa.features.extractors.cape.extractor import CapeExtractor import json + from capa.features.extractors.cape.extractor import CapeExtractor + with open(path) as report_file: report = report_file.read() report = json.loads(report) - + extractor = CapeExtractor.from_report(report) return extractor + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): @@ -616,8 +618,8 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( ("", "process=(),thread=", capa.features.insn.API(""), True), ("", "process=(),thread=", capa.features.insn.API(""), False), # thread/number call argument - ("", "process=(),thread=", capa.features.insn.Number(""), True), - ("", "process=(),thread=", capa.features.insn.Number(""), False), + ("", "process=(),thread=", capa.features.insn.Number(), True), + ("", "process=(),thread=", capa.features.insn.Number(), False), # thread/string call argument ("", "process=(),thread=", capa.features.common.String(""), True), ("", "process=(),thread=", capa.features.common.String(""), False), @@ -630,7 +632,11 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( DYNAMIC_FEATURE_COUNT_PRESENCE_TESTS = sorted( [ # file/string - ("", "file", capa.features.common.String(""), ), + ( + "", + "file", + capa.features.common.String(""), + ), ("", "file", capa.features.common.String("makansh menah"), 0), # file/sections ("", "file", capa.features.file.Section(""), 1), @@ -648,8 +654,8 @@ DYNAMIC_FEATURE_COUNT_PRESENCE_TESTS = sorted( ("", "process=(),thread=", capa.features.insn.API(""), 1), ("", "process=(),thread=", capa.features.insn.API(""), 0), # thread/number call argument - ("", "process=(),thread=", capa.features.insn.Number(""), 1), - ("", "process=(),thread=", capa.features.insn.Number(""), 0), + ("", "process=(),thread=", capa.features.insn.Number(), 1), + ("", "process=(),thread=", capa.features.insn.Number(), 0), # thread/string call argument ("", "process=(),thread=", capa.features.common.String(""), 1), ("", "process=(),thread=", capa.features.common.String(""), 0), diff --git a/tests/test_cape_features.py b/tests/test_cape_features.py index 5e50c9ab..d7fae8f9 100644 --- a/tests/test_cape_features.py +++ b/tests/test_cape_features.py @@ -8,6 +8,7 @@ import fixtures from fixtures import * + @fixtures.parametrize( "sample,scope,feature,expected", fixtures.DYNAMIC_FEATURE_PRESENCE_TESTS, From f02178852bd64333adf4cee91b9fdec9a470b004 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 17:01:05 +0100 Subject: [PATCH 49/82] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a736a60..94153c2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### New Features - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04) +- Add unit tests for the new CAPE extractor @yelhamer ### Breaking Changes - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat From 4acdca090d08611890fbc9ebacacb7f27d1400ef Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 17:14:59 +0100 Subject: [PATCH 50/82] bug fixes --- tests/fixtures.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index ac8d53ad..6d3113ff 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -41,7 +41,7 @@ from capa.features.common import ( FeatureAccess, ) from capa.features.address import Address -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, ThreadHandle, ProcessHandle, FunctionHandle +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, ThreadHandle, ProcessHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor CD = os.path.dirname(__file__) @@ -342,7 +342,7 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "294b8db1f2702b60fb2e42fdc50c2cee6a5046112da9a5703a548a4fa50477bc.elf_") elif name.startswith("2bf18d"): return os.path.join(CD, "data", "2bf18d0403677378adad9001b1243211.elf_") - elif name.startswith("02179f"): + elif name.startswith("dynamic_02179f"): return os.path.join(CD, "data", "dynamic_02179f3ba93663074740b5c0d283bae2.json") else: raise ValueError(f"unexpected sample fixture: {name}") @@ -404,6 +404,8 @@ def get_sample_md5_by_name(name): return "3db3e55b16a7b1b1afb970d5e77c5d98" elif name.startswith("2bf18d"): return "2bf18d0403677378adad9001b1243211" + elif name.startswith("dynamic_02179f"): + return "dynamic_02179f3ba93663074740b5c0d283bae2.json" else: raise ValueError(f"unexpected sample fixture: {name}") @@ -428,7 +430,7 @@ def get_thread(extractor, ph: ProcessHandle, tid: int) -> ThreadHandle: for th in extractor.get_processes(ph): if th.tid == tid: return ThreadHandle(tid) - raise ValueError("process not found") + raise ValueError("thread not found") def get_function(extractor, fva: int) -> FunctionHandle: @@ -539,9 +541,10 @@ def resolve_scope(scope): inner_function.__name__ = scope return inner_function elif "thread=" in scope: + # like `process=(712:935),thread=1002` assert "process=" in scope pspec, _, tspec = scope.partition(",") - pspec = scope.partition("=")[2].split(",") + pspec = scope.partition("=")[2].split(":") assert len(pspec) == 2 ppid, pid = map(lambda x: int(x), pspec) tid = int(tspec) @@ -557,7 +560,8 @@ def resolve_scope(scope): inner_thread.__name__ = scope return inner_thread elif "process=" in scope: - pspec = scope.partition("=")[2].split(",") + # like `process=(712:935)` + pspec = scope.partition("=")[2].split(":") assert len(pspec) == 2 ppid, pid = map(lambda x: int(x), pspec) @@ -601,7 +605,7 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( ("", "file", capa.features.common.String(""), True), ("", "file", capa.features.common.String(""), True), ("", "file", capa.features.common.String(""), True), - ("", "file", capa.features.common.String("makansh menah"), False), + ("", "file", capa.features.common.String("nope"), False), # file/sections ("", "file", capa.features.file.Section(""), True), ("", "file", capa.features.file.Section(""), False), @@ -637,7 +641,7 @@ DYNAMIC_FEATURE_COUNT_PRESENCE_TESTS = sorted( "file", capa.features.common.String(""), ), - ("", "file", capa.features.common.String("makansh menah"), 0), + ("", "file", capa.features.common.String("nope"), 0), # file/sections ("", "file", capa.features.file.Section(""), 1), ("", "file", capa.features.file.Section(""), 0), From 38596f8d0e61e99fad1cb80701f32cb02d8178aa Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 19:32:56 +0100 Subject: [PATCH 51/82] add features for the QakBot sample --- tests/fixtures.py | 72 ++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 6d3113ff..9834c7ae 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -41,7 +41,7 @@ from capa.features.common import ( FeatureAccess, ) from capa.features.address import Address -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, ThreadHandle, ProcessHandle +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, ThreadHandle, ProcessHandle, FunctionHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor CD = os.path.dirname(__file__) @@ -602,31 +602,29 @@ def parametrize(params, values, **kwargs): DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( [ # file/string - ("", "file", capa.features.common.String(""), True), - ("", "file", capa.features.common.String(""), True), - ("", "file", capa.features.common.String(""), True), - ("", "file", capa.features.common.String("nope"), False), + ("dynamic_02179f", "file", capa.features.common.String("T_Ba?.BcRJa"), True), + ("dynamic_02179f", "file", capa.features.common.String("GetNamedPipeClientSessionId"), True), + ("dynamic_02179f", "file", capa.features.common.String("nope"), False), # file/sections - ("", "file", capa.features.file.Section(""), True), - ("", "file", capa.features.file.Section(""), False), + ("dynamic_02179f", "file", capa.features.file.Section(".rdata"), True), + ("dynamic_02179f", "file", capa.features.file.Section(".nope"), False), # file/imports - ("", "file", capa.features.file.Import(""), True), - ("", "file", capa.features.file.Import(""), False), + ("dynamic_02179f", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), True), + ("dynamic_02179f", "file", capa.features.file.Import("Nope"), False), # file/exports - ("", "file", capa.features.file.Export(""), True), - ("", "file", capa.features.file.Export(""), False), + ("dynamic_02179f", "file", capa.features.file.Export("Nope"), False), # process/environment variables - ("", "process=()", capa.features.common.String(""), True), - ("", "process=()", capa.features.common.String(""), False), + ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), True), + ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("nope"), False), # thread/api calls - ("", "process=(),thread=", capa.features.insn.API(""), True), - ("", "process=(),thread=", capa.features.insn.API(""), False), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("LdrGetProcedureAddress"), True), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("GetActiveWindow"), False), # thread/number call argument - ("", "process=(),thread=", capa.features.insn.Number(), True), - ("", "process=(),thread=", capa.features.insn.Number(), False), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(3071), True), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(110173), False), # thread/string call argument - ("", "process=(),thread=", capa.features.common.String(""), True), - ("", "process=(),thread=", capa.features.common.String(""), False), + #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), + #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -636,33 +634,29 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( DYNAMIC_FEATURE_COUNT_PRESENCE_TESTS = sorted( [ # file/string - ( - "", - "file", - capa.features.common.String(""), - ), - ("", "file", capa.features.common.String("nope"), 0), + ("dynamic_02179f", "file", capa.features.common.String("T_Ba?.BcRJa"), True), + ("dynamic_02179f", "file", capa.features.common.String("GetNamedPipeClientSessionId"), True), + ("dynamic_02179f", "file", capa.features.common.String("nope"), False), # file/sections - ("", "file", capa.features.file.Section(""), 1), - ("", "file", capa.features.file.Section(""), 0), + ("dynamic_02179f", "file", capa.features.file.Section(".rdata"), True), + ("dynamic_02179f", "file", capa.features.file.Section(".nope"), False), # file/imports - ("", "file", capa.features.file.Import(""), 1), - ("", "file", capa.features.file.Import(""), 0), + ("dynamic_02179f", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), True), + ("dynamic_02179f", "file", capa.features.file.Import("Nope"), False), # file/exports - ("", "file", capa.features.file.Export(""), 1), - ("", "file", capa.features.file.Export(""), 0), + ("dynamic_02179f", "file", capa.features.file.Export("Nope"), False), # process/environment variables - ("", "process=()", capa.features.common.String(""), 1), - ("", "process=()", capa.features.common.String(""), 0), + ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), True), + ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("nope"), False), # thread/api calls - ("", "process=(),thread=", capa.features.insn.API(""), 1), - ("", "process=(),thread=", capa.features.insn.API(""), 0), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("LdrGetProcedureAddress"), True), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("GetActiveWindow"), False), # thread/number call argument - ("", "process=(),thread=", capa.features.insn.Number(), 1), - ("", "process=(),thread=", capa.features.insn.Number(), 0), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(3071), True), + ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(110173), False), # thread/string call argument - ("", "process=(),thread=", capa.features.common.String(""), 1), - ("", "process=(),thread=", capa.features.common.String(""), 0), + #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), + #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. From 3c8abab574430983ddaa05c245482f46499a91cc Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:40:09 +0100 Subject: [PATCH 52/82] fix bugs and refactor code --- capa/features/extractors/cape/extractor.py | 10 +++--- capa/features/extractors/cape/file.py | 21 ++++++----- capa/features/extractors/cape/global_.py | 42 +++++++++++----------- capa/features/extractors/cape/process.py | 20 +++-------- capa/features/extractors/cape/thread.py | 17 ++++++--- capa/features/insn.py | 18 +++++----- 6 files changed, 65 insertions(+), 63 deletions(-) diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index 01836fee..79be0b24 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) class CapeExtractor(DynamicExtractor): - def __init__(self, static: Dict, behavior: Dict, network: Dict): + def __init__(self, static: Dict, behavior: Dict): super().__init__() self.static = static self.behavior = behavior @@ -30,7 +30,7 @@ class CapeExtractor(DynamicExtractor): def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features - def get_file_features(self) -> Iterator[Tuple[Feature, Address]]: + def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.cape.file.extract_features(self.static) def get_processes(self) -> Iterator[ProcessHandle]: @@ -39,19 +39,19 @@ class CapeExtractor(DynamicExtractor): def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.cape.process.extract_features(self.behavior, ph) - def get_threads(self, ph: ProcessHandle) -> Iterator[ProcessHandle]: + def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: yield from capa.features.extractors.cape.process.get_threads(self.behavior, ph) def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.cape.thread.extract_features(self.behavior, ph, th) @classmethod - def from_report(cls, report: Dict) -> "DynamicExtractor": + def from_report(cls, report: Dict) -> "CapeExtractor": static = report["static"] format_ = list(static.keys())[0] static = static[format_] - static.update(report["target"]) static.update(report["behavior"].pop("summary")) + static.update(report["target"]) static.update({"processtree": report["behavior"]["processtree"]}) static.update({"strings": report["strings"]}) static.update({"format": format_}) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index 12caad2b..fcace6d1 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -7,9 +7,9 @@ # See the License for the specific language governing permissions and limitations under the License. import logging -from typing import Any, Dict, List, Tuple, Iterator +from typing import Dict, Tuple, Iterator -from capa.features.file import Export, Import, Section, FunctionName +from capa.features.file import Export, Import, Section from capa.features.common import String, Feature from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress from capa.features.extractors.base_extractor import ProcessHandle @@ -21,13 +21,15 @@ def get_processes(static: Dict) -> Iterator[ProcessHandle]: """ get all the created processes for a sample """ + def rec(process): inner: Dict[str, str] = {"name": process["name"], "ppid": process["parent_id"]} yield ProcessHandle(pid=process["pid"], inner=inner) for child in process["children"]: - rec(child) + yield from rec(child) - yield from rec(static["processtree"]) + for process in static["processtree"]: + yield from rec(process) def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: @@ -35,20 +37,21 @@ def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: extract the names of imported library files, for example: USER32.dll """ for library in static["imports"]: - name, address = library["name"], int(library["virtual_address"], 16) - yield Import(name), address + for function in library["imports"]: + name, address = function["name"], int(function["address"], 16) + yield Import(name), AbsoluteVirtualAddress(address) def extract_export_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: for function in static["exports"]: - name, address = function["name"], int(function["virtual_address"], 16) - yield Export(name), address + name, address = function["name"], int(function["address"], 16) + yield Export(name), AbsoluteVirtualAddress(address) def extract_section_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: for section in static["sections"]: name, address = section["name"], int(section["virtual_address"], 16) - yield Section(name), address + yield Section(name), AbsoluteVirtualAddress(address) def extract_file_strings(static: Dict) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/cape/global_.py b/capa/features/extractors/cape/global_.py index 6479f109..70b5d2bf 100644 --- a/capa/features/extractors/cape/global_.py +++ b/capa/features/extractors/cape/global_.py @@ -32,51 +32,51 @@ logger = logging.getLogger(__name__) def guess_elf_os(file_output) -> Iterator[Tuple[Feature, Address]]: # operating systems recognized by the file command: https://github.com/file/file/blob/master/src/readelf.c#L609 if "Linux" in file_output: - return OS(OS_LINUX), NO_ADDRESS + yield OS(OS_LINUX), NO_ADDRESS elif "Hurd" in file_output: - return OS("hurd"), NO_ADDRESS + yield OS("hurd"), NO_ADDRESS elif "Solaris" in file_output: - return OS("solaris"), NO_ADDRESS + yield OS("solaris"), NO_ADDRESS elif "kFreeBSD" in file_output: - return OS("freebsd"), NO_ADDRESS + yield OS("freebsd"), NO_ADDRESS elif "kNetBSD" in file_output: - return OS("netbsd"), NO_ADDRESS + yield OS("netbsd"), NO_ADDRESS else: - return OS(OS_ANY), NO_ADDRESS + yield OS(OS_ANY), NO_ADDRESS def extract_arch(static) -> Iterator[Tuple[Feature, Address]]: - if "Intel 80386" in static["target"]["type"]: - return Arch(ARCH_I386), NO_ADDRESS - elif "x86-64" in static["target"]["type"]: - return Arch(ARCH_AMD64), NO_ADDRESS + if "Intel 80386" in static["file"]["type"]: + yield Arch(ARCH_I386), NO_ADDRESS + elif "x86-64" in static["file"]["type"]: + yield Arch(ARCH_AMD64), NO_ADDRESS else: - return Arch(ARCH_ANY) + yield Arch(ARCH_ANY), NO_ADDRESS def extract_format(static) -> Iterator[Tuple[Feature, Address]]: - if "PE" in static["target"]["type"]: - return Format(FORMAT_PE), NO_ADDRESS - elif "ELF" in static["target"]["type"]: - return Format(FORMAT_ELF), NO_ADDRESS + if "PE" in static["file"]["type"]: + yield Format(FORMAT_PE), NO_ADDRESS + elif "ELF" in static["file"]["type"]: + yield Format(FORMAT_ELF), NO_ADDRESS else: - logger.debug(f"unknown file format, file command output: {static['target']['type']}") - return Format(FORMAT_UNKNOWN), NO_ADDRESS + logger.debug(f"unknown file format, file command output: {static['file']['type']}") + yield Format(FORMAT_UNKNOWN), NO_ADDRESS def extract_os(static) -> Iterator[Tuple[Feature, Address]]: # this variable contains the output of the file command - file_command = static["target"]["type"] + file_command = static["file"]["type"] if "WINDOWS" in file_command: - return OS(OS_WINDOWS), NO_ADDRESS + yield OS(OS_WINDOWS), NO_ADDRESS elif "ELF" in file_command: # implement os guessing from the cape trace - return guess_elf_os(file_command) + yield from guess_elf_os(file_command) else: # the sample is shellcode logger.debug(f"unsupported file format, file command output: {file_command}") - return OS(OS_ANY), NO_ADDRESS + yield OS(OS_ANY), NO_ADDRESS def extract_features(static) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/cape/process.py b/capa/features/extractors/cape/process.py index efb11299..8139e4a3 100644 --- a/capa/features/extractors/cape/process.py +++ b/capa/features/extractors/cape/process.py @@ -19,37 +19,27 @@ from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, logger = logging.getLogger(__name__) -def get_processes(behavior: Dict) -> Iterator[ProcessHandle]: - """ - get all created processes for a sample - """ - for process in behavior["processes"]: - inner: Dict[str, str] = {"name": process["name"], "ppid": process["parent_id"]} - yield ProcessHandle(pid=process["process_id"], inner=inner) - - -def get_threads(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: +def get_threads(behavior: Dict, ph: ProcessHandle) -> Iterator[ThreadHandle]: """ get a thread's child processes """ - threads: List = None for process in behavior["processes"]: if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: - threads = process["threads"] + threads: List = process["threads"] for thread in threads: - yield ThreadHandle(int(thread)) + yield ThreadHandle(int(thread), inner={}) def extract_environ_strings(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: """ extract strings from a process' provided environment variables. """ - environ: Dict[str, str] = None + for process in behavior["processes"]: if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: - environ = process["environ"] + environ: Dict[str, str] = process["environ"] if not environ: return diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index 9a4438d2..3a1217c9 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -11,7 +11,7 @@ from typing import Any, Dict, List, Tuple, Iterator from capa.features.insn import API, Number from capa.features.common import String, Feature -from capa.features.address import Address +from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle logger = logging.getLogger(__name__) @@ -31,17 +31,24 @@ def extract_call_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) - Feature, address; where Feature is either: API, Number, or String. """ - calls: List[Dict] = None for process in behavior["processes"]: if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: - calls: List[Dict] = process + calls: List[Dict] = process["calls"] tid = str(th.tid) for call in calls: if call["thread_id"] != tid: continue - yield Number(int(call["return"], 16)), int(call["caller"], 16) - yield API(call["api"]), int(call["caller"], 16) + + caller = int(call["caller"], 16) + caller = AbsoluteVirtualAddress(caller) + for arg in call["arguments"]: + try: + yield Number(int(arg["value"], 16)), caller + except ValueError: + continue + yield Number(int(call["return"], 16)), caller + yield API(call["api"]), caller def extract_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/insn.py b/capa/features/insn.py index 1e977e5a..4f4a78d0 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -25,8 +25,8 @@ class API(Feature): if signature.isidentifier(): # api call is in the legacy format super().__init__(signature, description=description) - self.args = {} - self.ret = False + self.args: Dict[str, str] = {} + self.ret = "" else: # api call is in the strace format and therefore has to be parsed name, self.args, self.ret = self.parse_signature(signature) @@ -43,30 +43,32 @@ class API(Feature): return False assert isinstance(other, API) - if {} in (self.args, other.args) or False in (self.ret, other.ret): + if {} in (self.args, other.args) or "" in (self.ret, other.ret): # Legacy API feature return super().__eq__(other) # API call with arguments return super().__eq__(other) and self.args == other.args and self.ret == other.ret - def parse_signature(self, signature: str) -> Tuple[str, Optional[Dict[str, str]], Optional[str]]: + def parse_signature(self, signature: str) -> Tuple[str, Dict[str, str], str]: # todo: optimize this method and improve the code quality import re - args = ret = False + args: Dict[str, str] = {} + ret = "" match = re.findall(r"(.+\(.*\)) ?=? ?([^=]*)", signature) if not match: - return "", None, None + return "", {}, "" if len(match[0]) == 2: ret = match[0][1] match = re.findall(r"(.*)\((.*)\)", match[0][0]) if len(match[0]) == 2: - args = (match[0][1] + ", ").split(", ") + args_: Dict[str, str] = (match[0][1] + ", ").split(", ") map(lambda x: {f"arg{x[0]}": x[1]}, enumerate(args)) - args = [{} | arg for arg in args][0] + for num, arg in enumerate(args_): + args.update({f"arg {0}": arg}) return match[0][0], args, ret From d4c4a17eb7f208e345dd8013703f91cb4bdfc315 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:42:27 +0100 Subject: [PATCH 53/82] bugfixes and add cape sample tests --- tests/fixtures.py | 80 ++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 9834c7ae..87147eb7 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -343,7 +343,7 @@ def get_data_path_by_name(name): elif name.startswith("2bf18d"): return os.path.join(CD, "data", "2bf18d0403677378adad9001b1243211.elf_") elif name.startswith("dynamic_02179f"): - return os.path.join(CD, "data", "dynamic_02179f3ba93663074740b5c0d283bae2.json") + return os.path.join(CD, "data", "dynamic_02179f3ba93663074740b5c0d283bae2.json_") else: raise ValueError(f"unexpected sample fixture: {name}") @@ -405,7 +405,7 @@ def get_sample_md5_by_name(name): elif name.startswith("2bf18d"): return "2bf18d0403677378adad9001b1243211" elif name.startswith("dynamic_02179f"): - return "dynamic_02179f3ba93663074740b5c0d283bae2.json" + return "dynamic_02179f3ba93663074740b5c0d283bae2.json_" else: raise ValueError(f"unexpected sample fixture: {name}") @@ -427,9 +427,9 @@ def get_process(extractor, ppid: int, pid: int) -> ProcessHandle: def get_thread(extractor, ph: ProcessHandle, tid: int) -> ThreadHandle: - for th in extractor.get_processes(ph): + for th in extractor.get_threads(ph): if th.tid == tid: - return ThreadHandle(tid) + return th raise ValueError("thread not found") @@ -541,13 +541,13 @@ def resolve_scope(scope): inner_function.__name__ = scope return inner_function elif "thread=" in scope: - # like `process=(712:935),thread=1002` + # like `process=(pid:ppid),thread=1002` assert "process=" in scope pspec, _, tspec = scope.partition(",") - pspec = scope.partition("=")[2].split(":") + pspec = pspec.partition("=")[2][1:-1].split(":") assert len(pspec) == 2 - ppid, pid = map(lambda x: int(x), pspec) - tid = int(tspec) + pid, ppid = map(lambda x: int(x), pspec) + tid = int(tspec.partition("=")[2]) def inner_thread(extractor): ph = get_process(extractor, ppid, pid) @@ -560,10 +560,10 @@ def resolve_scope(scope): inner_thread.__name__ = scope return inner_thread elif "process=" in scope: - # like `process=(712:935)` - pspec = scope.partition("=")[2].split(":") + # like `process=(pid:ppid)` + pspec = scope.partition("=")[2][1:-1].split(":") assert len(pspec) == 2 - ppid, pid = map(lambda x: int(x), pspec) + pid, ppid = map(lambda x: int(x), pspec) def inner_process(extractor): ph = get_process(extractor, ppid, pid) @@ -614,49 +614,59 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( # file/exports ("dynamic_02179f", "file", capa.features.file.Export("Nope"), False), # process/environment variables - ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), True), + ( + "dynamic_02179f", + "process=(1180:3052)", + capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), + True, + ), ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("nope"), False), # thread/api calls - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("LdrGetProcedureAddress"), True), - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("GetActiveWindow"), False), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("NtQueryValueKey"), True), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("GetActiveWindow"), False), # thread/number call argument - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(3071), True), - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(110173), False), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), True), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), False), # thread/string call argument - #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), - #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), + # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), + # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. key=lambda t: (t[0], t[1]), ) -DYNAMIC_FEATURE_COUNT_PRESENCE_TESTS = sorted( +DYNAMIC_FEATURE_COUNT_TESTS = sorted( [ # file/string - ("dynamic_02179f", "file", capa.features.common.String("T_Ba?.BcRJa"), True), - ("dynamic_02179f", "file", capa.features.common.String("GetNamedPipeClientSessionId"), True), - ("dynamic_02179f", "file", capa.features.common.String("nope"), False), + ("dynamic_02179f", "file", capa.features.common.String("T_Ba?.BcRJa"), 1), + ("dynamic_02179f", "file", capa.features.common.String("GetNamedPipeClientSessionId"), 1), + ("dynamic_02179f", "file", capa.features.common.String("nope"), 0), # file/sections - ("dynamic_02179f", "file", capa.features.file.Section(".rdata"), True), - ("dynamic_02179f", "file", capa.features.file.Section(".nope"), False), + ("dynamic_02179f", "file", capa.features.file.Section(".rdata"), 1), + ("dynamic_02179f", "file", capa.features.file.Section(".nope"), 0), # file/imports - ("dynamic_02179f", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), True), - ("dynamic_02179f", "file", capa.features.file.Import("Nope"), False), + ("dynamic_02179f", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), 1), + ("dynamic_02179f", "file", capa.features.file.Import("Nope"), 0), # file/exports - ("dynamic_02179f", "file", capa.features.file.Export("Nope"), False), + ("dynamic_02179f", "file", capa.features.file.Export("Nope"), 0), # process/environment variables - ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), True), - ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("nope"), False), + ( + "dynamic_02179f", + "process=(1180:3052)", + capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), + 1, + ), + ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("nope"), 0), # thread/api calls - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("LdrGetProcedureAddress"), True), - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.API("GetActiveWindow"), False), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("NtQueryValueKey"), 5), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("GetActiveWindow"), 0), # thread/number call argument - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(3071), True), - ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.insn.Number(110173), False), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), 1), + ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), 0), # thread/string call argument - #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), - #("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), + # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), + # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. From 49b77d54777fe965d243145e1e9c0f90e89be34c Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:49:19 +0100 Subject: [PATCH 54/82] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8846b14f..7fa58bdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### New Features - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04) +- Add a dynamic extractor for the CAPE sandbox @yelhamer [#1535](https://github.com/mandiant/capa/issues/1535) ### Breaking Changes - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat From c88f859daed5faa0c333ce29d6b6bd31996a805d Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:55:06 +0100 Subject: [PATCH 55/82] removed redundant HBI features --- capa/rules/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 01908790..64fd7e37 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -261,12 +261,6 @@ def parse_feature(key: str): return capa.features.common.StringFactory elif key == "substring": return capa.features.common.Substring - elif key == "registry": - return capa.features.common.Registry - elif key == "filename": - return capa.features.common.Filename - elif key == "mutex": - return capa.features.common.Mutex elif key == "bytes": return capa.features.common.Bytes elif key == "number": From 624151c3f77b0be88897e3d1ec0d06351726bb73 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:55:12 +0100 Subject: [PATCH 56/82] Revert "update changelog" This reverts commit 49b77d54777fe965d243145e1e9c0f90e89be34c. --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fa58bdd..8846b14f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,6 @@ ### New Features - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04) -- Add a dynamic extractor for the CAPE sandbox @yelhamer [#1535](https://github.com/mandiant/capa/issues/1535) ### Breaking Changes - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat From 33de609560cd24131ac52b9cb40d7985740c023a Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:55:22 +0100 Subject: [PATCH 57/82] Revert "removed redundant HBI features" This reverts commit c88f859daed5faa0c333ce29d6b6bd31996a805d. --- capa/rules/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 64fd7e37..01908790 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -261,6 +261,12 @@ def parse_feature(key: str): return capa.features.common.StringFactory elif key == "substring": return capa.features.common.Substring + elif key == "registry": + return capa.features.common.Registry + elif key == "filename": + return capa.features.common.Filename + elif key == "mutex": + return capa.features.common.Mutex elif key == "bytes": return capa.features.common.Bytes elif key == "number": From ef999ed95478b21bf8f4e0151c259d864d3cbdd0 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:56:10 +0100 Subject: [PATCH 58/82] rules/__init__.py: remove redundant HBI features --- capa/rules/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 01908790..64fd7e37 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -261,12 +261,6 @@ def parse_feature(key: str): return capa.features.common.StringFactory elif key == "substring": return capa.features.common.Substring - elif key == "registry": - return capa.features.common.Registry - elif key == "filename": - return capa.features.common.Filename - elif key == "mutex": - return capa.features.common.Mutex elif key == "bytes": return capa.features.common.Bytes elif key == "number": From 8eef210547063630f193c06300cd3deb44448999 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 19 Jun 2023 23:57:51 +0100 Subject: [PATCH 59/82] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8846b14f..57adfe9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### New Features - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04) +- Add a dynamic feature extractor for the CAPE sandbox @yelhamer [#1535](https://github.com/mandiant/capa/issues/1535) ### Breaking Changes - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat From b9a4d72b42f7461b390e8401e0613f8b8428db6f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 00:12:21 +0100 Subject: [PATCH 60/82] cape/file.py: add usage of helpers.generate_symbols() --- capa/features/extractors/cape/file.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index fcace6d1..436e6bd0 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -13,6 +13,7 @@ from capa.features.file import Export, Import, Section from capa.features.common import String, Feature from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress from capa.features.extractors.base_extractor import ProcessHandle +from capa.features.extractors.helpers import generate_symbols logger = logging.getLogger(__name__) @@ -38,8 +39,9 @@ def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: """ for library in static["imports"]: for function in library["imports"]: - name, address = function["name"], int(function["address"], 16) - yield Import(name), AbsoluteVirtualAddress(address) + addr = int(function["address"], 16) + for name in generate_symbols(function["name"]): + yield Import(name), AbsoluteVirtualAddress(addr) def extract_export_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: From 9cc34cb70f21c1e2253e118a31f3e1f39c3986b6 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 00:19:55 +0100 Subject: [PATCH 61/82] cape/file.py: fix imports ordering and format --- capa/features/extractors/cape/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index 436e6bd0..92213c8b 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -12,8 +12,8 @@ from typing import Dict, Tuple, Iterator from capa.features.file import Export, Import, Section from capa.features.common import String, Feature from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import ProcessHandle from capa.features.extractors.helpers import generate_symbols +from capa.features.extractors.base_extractor import ProcessHandle logger = logging.getLogger(__name__) From ba63188f276a7ae3d9d40b081107752df91f89c9 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Jun 2023 10:02:57 +0100 Subject: [PATCH 62/82] cape/file.py: fix bug in call to helpers.generate_symbols() Co-authored-by: Willi Ballenthin --- capa/features/extractors/cape/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index 92213c8b..dbaf512d 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -40,7 +40,7 @@ def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: for library in static["imports"]: for function in library["imports"]: addr = int(function["address"], 16) - for name in generate_symbols(function["name"]): + for name in generate_symbols(library["name"], function["name"]): yield Import(name), AbsoluteVirtualAddress(addr) From a7cf3b5b10410f2b94ede92df338821d38675170 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 10:04:37 +0100 Subject: [PATCH 63/82] features/insn.py: revert added strace-based API feature --- capa/features/insn.py | 54 +++---------------------------------------- 1 file changed, 3 insertions(+), 51 deletions(-) diff --git a/capa/features/insn.py b/capa/features/insn.py index 4f4a78d0..f4be23c8 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import abc -from typing import Dict, Tuple, Union, Optional +from typing import Union, Optional import capa.helpers from capa.features.common import VALID_FEATURE_ACCESS, Feature @@ -21,56 +21,8 @@ def hex(n: int) -> str: class API(Feature): - def __init__(self, signature: str, description=None): - if signature.isidentifier(): - # api call is in the legacy format - super().__init__(signature, description=description) - self.args: Dict[str, str] = {} - self.ret = "" - else: - # api call is in the strace format and therefore has to be parsed - name, self.args, self.ret = self.parse_signature(signature) - super().__init__(name, description=description) - - # store the original signature for hashing purposes - self.signature = signature - - def __hash__(self): - return hash(self.signature) - - def __eq__(self, other): - if not isinstance(other, API): - return False - - assert isinstance(other, API) - if {} in (self.args, other.args) or "" in (self.ret, other.ret): - # Legacy API feature - return super().__eq__(other) - - # API call with arguments - return super().__eq__(other) and self.args == other.args and self.ret == other.ret - - def parse_signature(self, signature: str) -> Tuple[str, Dict[str, str], str]: - # todo: optimize this method and improve the code quality - import re - - args: Dict[str, str] = {} - ret = "" - - match = re.findall(r"(.+\(.*\)) ?=? ?([^=]*)", signature) - if not match: - return "", {}, "" - if len(match[0]) == 2: - ret = match[0][1] - - match = re.findall(r"(.*)\((.*)\)", match[0][0]) - if len(match[0]) == 2: - args_: Dict[str, str] = (match[0][1] + ", ").split(", ") - map(lambda x: {f"arg{x[0]}": x[1]}, enumerate(args)) - for num, arg in enumerate(args_): - args.update({f"arg {0}": arg}) - - return match[0][0], args, ret + def __init__(self, name: str, description=None): + super().__init__(name, description=description) class _AccessFeature(Feature, abc.ABC): From 41a481252ca5257409290c7bcf52e3154fd3881f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Jun 2023 10:08:12 +0100 Subject: [PATCH 64/82] Update CHANGELOG.md Co-authored-by: Moritz --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94153c2a..cb4572ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ ### New Features - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04) -- Add unit tests for the new CAPE extractor @yelhamer +- Add unit tests for the new CAPE extractor #1563 @yelhamer ### Breaking Changes - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat From 48bd04b387786900d5880679a45f23d024d6db3e Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Jun 2023 10:09:00 +0100 Subject: [PATCH 65/82] tests/fixtures.py: return direct extractor with no intermediate variable Co-authored-by: Moritz --- tests/fixtures.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 87147eb7..e97b961f 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -193,8 +193,7 @@ def get_cape_extractor(path): report = report_file.read() report = json.loads(report) - extractor = CapeExtractor.from_report(report) - return extractor + return CapeExtractor.from_report(report) def extract_global_features(extractor): From ec3366b0e58017c17b68f163a2d767ea8c314f67 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Jun 2023 10:09:27 +0100 Subject: [PATCH 66/82] Update tests/fixtures.py Co-authored-by: Moritz --- tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index e97b961f..87b9e901 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -545,7 +545,7 @@ def resolve_scope(scope): pspec, _, tspec = scope.partition(",") pspec = pspec.partition("=")[2][1:-1].split(":") assert len(pspec) == 2 - pid, ppid = map(lambda x: int(x), pspec) + pid, ppid = map(int, pspec) tid = int(tspec.partition("=")[2]) def inner_thread(extractor): From 8547277958210d58516e3002e210f85d8a9ddee3 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Jun 2023 10:10:42 +0100 Subject: [PATCH 67/82] tests/fixtures.py bugfix: remove redundant lambda function Co-authored-by: Moritz --- tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 87b9e901..dc7b308d 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -562,7 +562,7 @@ def resolve_scope(scope): # like `process=(pid:ppid)` pspec = scope.partition("=")[2][1:-1].split(":") assert len(pspec) == 2 - pid, ppid = map(lambda x: int(x), pspec) + pid, ppid = map(int, pspec) def inner_process(extractor): ph = get_process(extractor, ppid, pid) From 4db80e75a4bce78c888dbd7df9adcbcf300ec918 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 10:13:06 +0100 Subject: [PATCH 68/82] add mode and encoding parameters to open() --- tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index dc7b308d..baacabfa 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -189,7 +189,7 @@ def get_cape_extractor(path): from capa.features.extractors.cape.extractor import CapeExtractor - with open(path) as report_file: + with open(path, "r", encoding="utf-8") as report_file: report = report_file.read() report = json.loads(report) From 374fb033c1ee7b9ffb4c97101115f0bd94a2d5eb Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 10:29:52 +0100 Subject: [PATCH 69/82] add support for gzip compressed cape samples, and fix QakBot sample path --- tests/fixtures.py | 81 ++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index baacabfa..5310c085 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -185,13 +185,14 @@ def get_binja_extractor(path): @lru_cache(maxsize=1) def get_cape_extractor(path): + import gzip import json from capa.features.extractors.cape.extractor import CapeExtractor - with open(path, "r", encoding="utf-8") as report_file: - report = report_file.read() - report = json.loads(report) + with gzip.open(path, "r") as compressed_report: + report_json = compressed_report.read() + report = json.loads(report_json) return CapeExtractor.from_report(report) @@ -341,8 +342,10 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "294b8db1f2702b60fb2e42fdc50c2cee6a5046112da9a5703a548a4fa50477bc.elf_") elif name.startswith("2bf18d"): return os.path.join(CD, "data", "2bf18d0403677378adad9001b1243211.elf_") - elif name.startswith("dynamic_02179f"): - return os.path.join(CD, "data", "dynamic_02179f3ba93663074740b5c0d283bae2.json_") + elif name.startswith("0000a657"): + return os.path.join( + CD, "data/dynamic/cape", "0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json.gz" + ) else: raise ValueError(f"unexpected sample fixture: {name}") @@ -403,8 +406,8 @@ def get_sample_md5_by_name(name): return "3db3e55b16a7b1b1afb970d5e77c5d98" elif name.startswith("2bf18d"): return "2bf18d0403677378adad9001b1243211" - elif name.startswith("dynamic_02179f"): - return "dynamic_02179f3ba93663074740b5c0d283bae2.json_" + elif name.startswith("0000a657"): + return "0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json.gz" else: raise ValueError(f"unexpected sample fixture: {name}") @@ -601,34 +604,34 @@ def parametrize(params, values, **kwargs): DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( [ # file/string - ("dynamic_02179f", "file", capa.features.common.String("T_Ba?.BcRJa"), True), - ("dynamic_02179f", "file", capa.features.common.String("GetNamedPipeClientSessionId"), True), - ("dynamic_02179f", "file", capa.features.common.String("nope"), False), + ("0000a657", "file", capa.features.common.String("T_Ba?.BcRJa"), True), + ("0000a657", "file", capa.features.common.String("GetNamedPipeClientSessionId"), True), + ("0000a657", "file", capa.features.common.String("nope"), False), # file/sections - ("dynamic_02179f", "file", capa.features.file.Section(".rdata"), True), - ("dynamic_02179f", "file", capa.features.file.Section(".nope"), False), + ("0000a657", "file", capa.features.file.Section(".rdata"), True), + ("0000a657", "file", capa.features.file.Section(".nope"), False), # file/imports - ("dynamic_02179f", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), True), - ("dynamic_02179f", "file", capa.features.file.Import("Nope"), False), + ("0000a657", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), True), + ("0000a657", "file", capa.features.file.Import("Nope"), False), # file/exports - ("dynamic_02179f", "file", capa.features.file.Export("Nope"), False), + ("0000a657", "file", capa.features.file.Export("Nope"), False), # process/environment variables ( - "dynamic_02179f", + "0000a657", "process=(1180:3052)", capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), True, ), - ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("nope"), False), + ("0000a657", "process=(1180:3052)", capa.features.common.String("nope"), False), # thread/api calls - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("NtQueryValueKey"), True), - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("GetActiveWindow"), False), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.API("NtQueryValueKey"), True), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.API("GetActiveWindow"), False), # thread/number call argument - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), True), - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), False), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), True), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), False), # thread/string call argument - # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), - # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), + # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), + # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -638,34 +641,34 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( DYNAMIC_FEATURE_COUNT_TESTS = sorted( [ # file/string - ("dynamic_02179f", "file", capa.features.common.String("T_Ba?.BcRJa"), 1), - ("dynamic_02179f", "file", capa.features.common.String("GetNamedPipeClientSessionId"), 1), - ("dynamic_02179f", "file", capa.features.common.String("nope"), 0), + ("0000a657", "file", capa.features.common.String("T_Ba?.BcRJa"), 1), + ("0000a657", "file", capa.features.common.String("GetNamedPipeClientSessionId"), 1), + ("0000a657", "file", capa.features.common.String("nope"), 0), # file/sections - ("dynamic_02179f", "file", capa.features.file.Section(".rdata"), 1), - ("dynamic_02179f", "file", capa.features.file.Section(".nope"), 0), + ("0000a657", "file", capa.features.file.Section(".rdata"), 1), + ("0000a657", "file", capa.features.file.Section(".nope"), 0), # file/imports - ("dynamic_02179f", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), 1), - ("dynamic_02179f", "file", capa.features.file.Import("Nope"), 0), + ("0000a657", "file", capa.features.file.Import("NdrSimpleTypeUnmarshall"), 1), + ("0000a657", "file", capa.features.file.Import("Nope"), 0), # file/exports - ("dynamic_02179f", "file", capa.features.file.Export("Nope"), 0), + ("0000a657", "file", capa.features.file.Export("Nope"), 0), # process/environment variables ( - "dynamic_02179f", + "0000a657", "process=(1180:3052)", capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), 1, ), - ("dynamic_02179f", "process=(1180:3052)", capa.features.common.String("nope"), 0), + ("0000a657", "process=(1180:3052)", capa.features.common.String("nope"), 0), # thread/api calls - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("NtQueryValueKey"), 5), - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.API("GetActiveWindow"), 0), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.API("NtQueryValueKey"), 5), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.API("GetActiveWindow"), 0), # thread/number call argument - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), 1), - ("dynamic_02179f", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), 0), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), 1), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), 0), # thread/string call argument - # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), - # ("dynamic_02179f", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), + # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), + # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. From 61968146724c60d46922c6047cf1202fd4719eac Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 10:51:18 +0100 Subject: [PATCH 70/82] cape/file.py: fix KeyError bug --- capa/features/extractors/cape/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/cape/file.py b/capa/features/extractors/cape/file.py index dbaf512d..67ca17cc 100644 --- a/capa/features/extractors/cape/file.py +++ b/capa/features/extractors/cape/file.py @@ -40,7 +40,7 @@ def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: for library in static["imports"]: for function in library["imports"]: addr = int(function["address"], 16) - for name in generate_symbols(library["name"], function["name"]): + for name in generate_symbols(library["dll"], function["name"]): yield Import(name), AbsoluteVirtualAddress(addr) From cfa1d08e7ef4f6cfc4f96aba98b8186c3c97a418 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 11:28:40 +0100 Subject: [PATCH 71/82] update testfiles submodule to point at dev branch --- .gitmodules | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitmodules b/.gitmodules index 079d13dc..ec880fe0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,4 @@ [submodule "tests/data"] path = tests/data url = ../capa-testfiles.git + branch = dynamic-feature-extractor From 0623a5a8de88085edacb3be328e27d90bfa2a53d Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 12:13:57 +0100 Subject: [PATCH 72/82] point capa-testfiles submodule towards dynamic-feautre-extractor branch --- .gitmodules | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitmodules b/.gitmodules index ec880fe0..7e35b5b1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,5 @@ path = tests/data url = ../capa-testfiles.git branch = dynamic-feature-extractor +[submodule "tests/data/"] + branch = dynamic-feature-extractor From 40b2d5f724180f89c0dd510e9b32659a4ce4925f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 12:40:47 +0100 Subject: [PATCH 73/82] add a remote origin to submodule, and switch to that branch --- tests/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data b/tests/data index a37873c8..f4e21c60 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit a37873c8a571b515f2baaf19bfcfaff5c7ef5342 +Subproject commit f4e21c6037e40607f14d521af370f4eedc2c5eb9 From fa9b920b716f2e75a1bbb30c702f6813796f3663 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 13:17:53 +0100 Subject: [PATCH 74/82] cape/thread.py: do not extract return values, and extract argument values as Strings --- capa/features/extractors/cape/thread.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index 3a1217c9..bf3a6b39 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -42,13 +42,12 @@ def extract_call_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) - caller = int(call["caller"], 16) caller = AbsoluteVirtualAddress(caller) + yield API(call["api"]), caller for arg in call["arguments"]: try: yield Number(int(arg["value"], 16)), caller except ValueError: - continue - yield Number(int(call["return"], 16)), caller - yield API(call["api"]), caller + yield String(arg["value"]), caller def extract_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: From 1532ce1babb3f67e7bf43537dd84955ee23e0d2e Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 13:20:33 +0100 Subject: [PATCH 75/82] add tests for extracting argument values --- tests/fixtures.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 5310c085..0f70a9ab 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -630,8 +630,8 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), True), ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), False), # thread/string call argument - # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), - # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("NtQuerySystemInformation"), True), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -667,8 +667,8 @@ DYNAMIC_FEATURE_COUNT_TESTS = sorted( ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), 1), ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), 0), # thread/string call argument - # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("NtQuerySystemInformation"), True), - # ("0000a657", "process=(2852:3052),thread=500", capa.features.common.String("nope"), False), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("NtQuerySystemInformation"), True), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("nope"), False), ], # order tests by (file, item) # so that our LRU cache is most effective. From 31a349b13b438adad66b799e70ef0ea57912ffc3 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 13:21:52 +0100 Subject: [PATCH 76/82] cape feature tests: fix feature count function typo --- tests/test_cape_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cape_features.py b/tests/test_cape_features.py index d7fae8f9..043c0563 100644 --- a/tests/test_cape_features.py +++ b/tests/test_cape_features.py @@ -23,5 +23,5 @@ def test_cape_features(sample, scope, feature, expected): fixtures.DYNAMIC_FEATURE_COUNT_TESTS, indirect=["sample", "scope"], ) -def test_viv_feature_counts(sample, scope, feature, expected): +def test_cape_feature_counts(sample, scope, feature, expected): fixtures.do_test_feature_count(fixtures.get_cape_extractor, sample, scope, feature, expected) From d03ba5394fb32e005631185bbf27bed8f37f9dc8 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 13:26:25 +0100 Subject: [PATCH 77/82] cape/global_.py: add warning messages if architecture/os/format are unknown --- capa/features/extractors/cape/global_.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/cape/global_.py b/capa/features/extractors/cape/global_.py index 70b5d2bf..1582630b 100644 --- a/capa/features/extractors/cape/global_.py +++ b/capa/features/extractors/cape/global_.py @@ -42,6 +42,7 @@ def guess_elf_os(file_output) -> Iterator[Tuple[Feature, Address]]: elif "kNetBSD" in file_output: yield OS("netbsd"), NO_ADDRESS else: + logger.warn("unrecognized OS: %s", file_output) yield OS(OS_ANY), NO_ADDRESS @@ -51,6 +52,7 @@ def extract_arch(static) -> Iterator[Tuple[Feature, Address]]: elif "x86-64" in static["file"]["type"]: yield Arch(ARCH_AMD64), NO_ADDRESS else: + logger.warn("unrecognized Architecture: %s", static["file"]["type"]) yield Arch(ARCH_ANY), NO_ADDRESS @@ -60,7 +62,7 @@ def extract_format(static) -> Iterator[Tuple[Feature, Address]]: elif "ELF" in static["file"]["type"]: yield Format(FORMAT_ELF), NO_ADDRESS else: - logger.debug(f"unknown file format, file command output: {static['file']['type']}") + logger.warn("unknown file format, file command output: %s", static["file"]["type"]) yield Format(FORMAT_UNKNOWN), NO_ADDRESS From 0a4e3008afa0f35ec800df3648f482cecd563d36 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 13:51:16 +0100 Subject: [PATCH 78/82] fixtures.py: update CAPE's feature count and presence tests --- tests/fixtures.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 0f70a9ab..eec1012e 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -630,7 +630,7 @@ DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), True), ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), False), # thread/string call argument - ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("NtQuerySystemInformation"), True), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("SetThreadUILanguage"), True), ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("nope"), False), ], # order tests by (file, item) @@ -657,7 +657,7 @@ DYNAMIC_FEATURE_COUNT_TESTS = sorted( "0000a657", "process=(1180:3052)", capa.features.common.String("C:\\Users\\comp\\AppData\\Roaming\\Microsoft\\Jxoqwnx\\jxoqwn.exe"), - 1, + 2, ), ("0000a657", "process=(1180:3052)", capa.features.common.String("nope"), 0), # thread/api calls @@ -667,8 +667,8 @@ DYNAMIC_FEATURE_COUNT_TESTS = sorted( ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(0x000000EC), 1), ("0000a657", "process=(2852:3052),thread=2804", capa.features.insn.Number(110173), 0), # thread/string call argument - ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("NtQuerySystemInformation"), True), - ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("nope"), False), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("SetThreadUILanguage"), 1), + ("0000a657", "process=(2852:3052),thread=2804", capa.features.common.String("nope"), 0), ], # order tests by (file, item) # so that our LRU cache is most effective. From 78a3901c619f4f0535dfe74c77dae5b0b2a0aa1f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 15:59:22 +0100 Subject: [PATCH 79/82] cape/helpers.py: add a find_process() function for quick-fetching processes from the cape report --- capa/features/extractors/cape/helpers.py | 28 ++++++++++++++++++++++++ capa/features/extractors/cape/process.py | 10 ++++----- capa/features/extractors/cape/thread.py | 6 ++--- 3 files changed, 35 insertions(+), 9 deletions(-) create mode 100644 capa/features/extractors/cape/helpers.py diff --git a/capa/features/extractors/cape/helpers.py b/capa/features/extractors/cape/helpers.py new file mode 100644 index 00000000..fad9be0e --- /dev/null +++ b/capa/features/extractors/cape/helpers.py @@ -0,0 +1,28 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +from typing import Any, Dict, List + +from capa.features.extractors.base_extractor import ProcessHandle + + +def find_process(processes: List[Dict[str, Any]], ph: ProcessHandle) -> Dict[str, Any]: + """ + find a specific process identified by a process handler. + + args: + processes: a list of processes extracted by CAPE + ph: handle of the sought process + + return: + a CAPE-defined dictionary for the sought process' information + """ + + for process in processes: + if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: + return process + return {} diff --git a/capa/features/extractors/cape/process.py b/capa/features/extractors/cape/process.py index 8139e4a3..6282d189 100644 --- a/capa/features/extractors/cape/process.py +++ b/capa/features/extractors/cape/process.py @@ -24,9 +24,8 @@ def get_threads(behavior: Dict, ph: ProcessHandle) -> Iterator[ThreadHandle]: get a thread's child processes """ - for process in behavior["processes"]: - if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: - threads: List = process["threads"] + process = capa.features.extractors.cape.helpers.find_process(behavior["processes"], ph) + threads: List = process["threads"] for thread in threads: yield ThreadHandle(int(thread), inner={}) @@ -37,9 +36,8 @@ def extract_environ_strings(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple extract strings from a process' provided environment variables. """ - for process in behavior["processes"]: - if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: - environ: Dict[str, str] = process["environ"] + process = capa.features.extractors.cape.helpers.find_process(behavior["processes"], ph) + environ: Dict[str, str] = process["environ"] if not environ: return diff --git a/capa/features/extractors/cape/thread.py b/capa/features/extractors/cape/thread.py index bf3a6b39..9a1d7ed6 100644 --- a/capa/features/extractors/cape/thread.py +++ b/capa/features/extractors/cape/thread.py @@ -9,6 +9,7 @@ import logging from typing import Any, Dict, List, Tuple, Iterator +import capa.features.extractors.cape.helpers from capa.features.insn import API, Number from capa.features.common import String, Feature from capa.features.address import Address, AbsoluteVirtualAddress @@ -31,9 +32,8 @@ def extract_call_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) - Feature, address; where Feature is either: API, Number, or String. """ - for process in behavior["processes"]: - if ph.pid == process["process_id"] and ph.inner["ppid"] == process["parent_id"]: - calls: List[Dict] = process["calls"] + process = capa.features.extractors.cape.helpers.find_process(behavior["processes"], ph) + calls: List[Dict[str, Any]] = process["calls"] tid = str(th.tid) for call in calls: From 0502bfd95d94fb723f365ec0d02dc4652860c7de Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 20:24:38 +0100 Subject: [PATCH 80/82] remove cape report from get_md5_hash() function --- tests/fixtures.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index eec1012e..238d122b 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -406,8 +406,6 @@ def get_sample_md5_by_name(name): return "3db3e55b16a7b1b1afb970d5e77c5d98" elif name.startswith("2bf18d"): return "2bf18d0403677378adad9001b1243211" - elif name.startswith("0000a657"): - return "0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json.gz" else: raise ValueError(f"unexpected sample fixture: {name}") From f29db693c8a3fbc826ee32e75877416e697b5e70 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Jun 2023 20:25:19 +0100 Subject: [PATCH 81/82] fix git submodules error --- .gitmodules | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 7e35b5b1..ec880fe0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,5 +5,3 @@ path = tests/data url = ../capa-testfiles.git branch = dynamic-feature-extractor -[submodule "tests/data/"] - branch = dynamic-feature-extractor From 6712801b01ff952d5c720d7edd5eee88adff81ad Mon Sep 17 00:00:00 2001 From: Yacine Elhamer <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Jun 2023 20:30:06 +0100 Subject: [PATCH 82/82] tests/fixtures.py: update path forming for the cape sample Co-authored-by: Willi Ballenthin --- tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 238d122b..19acb7ff 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -344,7 +344,7 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "2bf18d0403677378adad9001b1243211.elf_") elif name.startswith("0000a657"): return os.path.join( - CD, "data/dynamic/cape", "0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json.gz" + CD, "data", "dynamic", "cape", "0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json.gz" ) else: raise ValueError(f"unexpected sample fixture: {name}")