Merge branch 'dynamic-feature-extraction' into static-extractor

This commit is contained in:
Willi Ballenthin
2023-06-27 09:42:16 +02:00
committed by GitHub
6 changed files with 150 additions and 66 deletions

View File

@@ -7,6 +7,7 @@
- Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04) - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04)
- Add a dynamic feature extractor for the CAPE sandbox @yelhamer [#1535](https://github.com/mandiant/capa/issues/1535) - Add a dynamic feature extractor for the CAPE sandbox @yelhamer [#1535](https://github.com/mandiant/capa/issues/1535)
- Add unit tests for the new CAPE extractor #1563 @yelhamer - Add unit tests for the new CAPE extractor #1563 @yelhamer
- Add a CAPE file format and CAPE-based dynamic feature extraction to scripts/show-features.py #1566 @yelhamer
### Breaking Changes ### Breaking Changes
- Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat

View File

@@ -449,6 +449,8 @@ VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
FORMAT_AUTO = "auto" FORMAT_AUTO = "auto"
FORMAT_SC32 = "sc32" FORMAT_SC32 = "sc32"
FORMAT_SC64 = "sc64" FORMAT_SC64 = "sc64"
FORMAT_CAPE = "cape"
DYNAMIC_FORMATS = (FORMAT_CAPE,)
FORMAT_FREEZE = "freeze" FORMAT_FREEZE = "freeze"
FORMAT_RESULT = "result" FORMAT_RESULT = "result"
FORMAT_UNKNOWN = "unknown" FORMAT_UNKNOWN = "unknown"

View File

@@ -1,4 +1,5 @@
import io import io
import json
import logging import logging
import binascii import binascii
import contextlib import contextlib
@@ -18,6 +19,7 @@ from capa.features.common import (
FORMAT_PE, FORMAT_PE,
FORMAT_ELF, FORMAT_ELF,
OS_WINDOWS, OS_WINDOWS,
FORMAT_CAPE,
FORMAT_FREEZE, FORMAT_FREEZE,
FORMAT_RESULT, FORMAT_RESULT,
Arch, Arch,

View File

@@ -6,6 +6,7 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License. # See the License for the specific language governing permissions and limitations under the License.
import os import os
import json
import inspect import inspect
import logging import logging
import contextlib import contextlib
@@ -14,10 +15,11 @@ from typing import NoReturn
import tqdm import tqdm
from capa.exceptions import UnsupportedFormatError from capa.exceptions import UnsupportedFormatError
from capa.features.common import FORMAT_PE, FORMAT_SC32, FORMAT_SC64, FORMAT_DOTNET, FORMAT_UNKNOWN, Format from capa.features.common import FORMAT_PE, FORMAT_CAPE, FORMAT_SC32, FORMAT_SC64, FORMAT_DOTNET, FORMAT_UNKNOWN, Format
EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32")
EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64")
EXTENSIONS_DYNAMIC = ("json", "json_")
EXTENSIONS_ELF = "elf_" EXTENSIONS_ELF = "elf_"
logger = logging.getLogger("capa") logger = logging.getLogger("capa")
@@ -52,14 +54,25 @@ def assert_never(value) -> NoReturn:
assert False, f"Unhandled value: {value} ({type(value).__name__})" assert False, f"Unhandled value: {value} ({type(value).__name__})"
def get_format_from_extension(sample: str) -> str: def get_format_from_report(sample: str) -> str:
if sample.endswith(EXTENSIONS_SHELLCODE_32): with open(sample, "rb") as f:
return FORMAT_SC32 report = json.load(f)
elif sample.endswith(EXTENSIONS_SHELLCODE_64): if "CAPE" in report.keys():
return FORMAT_SC64 return FORMAT_CAPE
return FORMAT_UNKNOWN return FORMAT_UNKNOWN
def get_format_from_extension(sample: str) -> str:
    """
    Guess the input format from the suffix of the sample path.

    Shellcode suffixes map directly onto their format constant. For the
    suffixes listed in EXTENSIONS_DYNAMIC the decision is delegated to
    get_format_from_report(). Any other suffix yields FORMAT_UNKNOWN.
    """
    if sample.endswith(EXTENSIONS_SHELLCODE_32):
        return FORMAT_SC32

    if sample.endswith(EXTENSIONS_SHELLCODE_64):
        return FORMAT_SC64

    if sample.endswith(EXTENSIONS_DYNAMIC):
        # the suffix alone is not conclusive here, so the report itself is inspected
        return get_format_from_report(sample)

    return FORMAT_UNKNOWN
def get_auto_format(path: str) -> str: def get_auto_format(path: str) -> str:
format_ = get_format(path) format_ = get_format(path)
if format_ == FORMAT_UNKNOWN: if format_ == FORMAT_UNKNOWN:

View File

@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations
""" """
import os import os
import sys import sys
import json
import time import time
import hashlib import hashlib
import logging import logging
@@ -20,7 +21,7 @@ import textwrap
import itertools import itertools
import contextlib import contextlib
import collections import collections
from typing import Any, Dict, List, Tuple, Callable, cast from typing import Any, Dict, List, Tuple, Callable, cast, Union
import halo import halo
import tqdm import tqdm
@@ -49,6 +50,7 @@ import capa.features.extractors.dnfile_
import capa.features.extractors.elffile import capa.features.extractors.elffile
import capa.features.extractors.dotnetfile import capa.features.extractors.dotnetfile
import capa.features.extractors.base_extractor import capa.features.extractors.base_extractor
import capa.features.extractors.cape.extractor
from capa.rules import Rule, Scope, RuleSet from capa.rules import Rule, Scope, RuleSet
from capa.engine import FeatureSet, MatchResults from capa.engine import FeatureSet, MatchResults
from capa.helpers import ( from capa.helpers import (
@@ -69,6 +71,7 @@ from capa.features.common import (
FORMAT_ELF, FORMAT_ELF,
OS_WINDOWS, OS_WINDOWS,
FORMAT_AUTO, FORMAT_AUTO,
FORMAT_CAPE,
FORMAT_SC32, FORMAT_SC32,
FORMAT_SC64, FORMAT_SC64,
FORMAT_DOTNET, FORMAT_DOTNET,
@@ -543,7 +546,8 @@ def get_extractor(
UnsupportedArchError UnsupportedArchError
UnsupportedOSError UnsupportedOSError
""" """
if format_ not in (FORMAT_SC32, FORMAT_SC64):
if format_ not in (FORMAT_SC32, FORMAT_SC64, FORMAT_CAPE):
if not is_supported_format(path): if not is_supported_format(path):
raise UnsupportedFormatError() raise UnsupportedFormatError()
@@ -553,7 +557,14 @@ def get_extractor(
if os_ == OS_AUTO and not is_supported_os(path): if os_ == OS_AUTO and not is_supported_os(path):
raise UnsupportedOSError() raise UnsupportedOSError()
if format_ == FORMAT_DOTNET: if format_ == FORMAT_CAPE:
import capa.features.extractors.cape.extractor
with open(path, "rb") as f:
report = json.load(f)
return capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)
elif format_ == FORMAT_DOTNET:
import capa.features.extractors.dnfile.extractor import capa.features.extractors.dnfile.extractor
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path) return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
@@ -618,6 +629,11 @@ def get_file_extractors(sample: str, format_: str) -> List[FeatureExtractor]:
elif format_ == capa.features.extractors.common.FORMAT_ELF: elif format_ == capa.features.extractors.common.FORMAT_ELF:
file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample)) file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample))
elif format_ == FORMAT_CAPE:
with open(sample, "rb") as f:
report = json.load(f)
file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report))
return file_extractors return file_extractors
@@ -924,6 +940,7 @@ def install_common_args(parser, wanted=None):
(FORMAT_ELF, "Executable and Linkable Format"), (FORMAT_ELF, "Executable and Linkable Format"),
(FORMAT_SC32, "32-bit shellcode"), (FORMAT_SC32, "32-bit shellcode"),
(FORMAT_SC64, "64-bit shellcode"), (FORMAT_SC64, "64-bit shellcode"),
(FORMAT_CAPE, "CAPE sandbox report"),
(FORMAT_FREEZE, "features previously frozen by capa"), (FORMAT_FREEZE, "features previously frozen by capa"),
] ]
format_help = ", ".join([f"{f[0]}: {f[1]}" for f in formats]) format_help = ", ".join([f"{f[0]}: {f[1]}" for f in formats])

View File

@@ -69,6 +69,7 @@ import sys
import logging import logging
import os.path import os.path
import argparse import argparse
from typing import cast
import capa.main import capa.main
import capa.rules import capa.rules
@@ -77,11 +78,12 @@ import capa.helpers
import capa.features import capa.features
import capa.exceptions import capa.exceptions
import capa.render.verbose as v import capa.render.verbose as v
import capa.features.common
import capa.features.freeze import capa.features.freeze
import capa.features.address import capa.features.address
from capa.helpers import log_unsupported_runtime_error from capa.helpers import get_auto_format, log_unsupported_runtime_error
from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor from capa.features.common import FORMAT_AUTO, FORMAT_FREEZE, DYNAMIC_FORMATS, is_global_feature
from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor
logger = logging.getLogger("capa.show-features") logger = logging.getLogger("capa.show-features")
@@ -98,6 +100,7 @@ def main(argv=None):
capa.main.install_common_args(parser, wanted={"format", "os", "sample", "signatures", "backend"}) capa.main.install_common_args(parser, wanted={"format", "os", "sample", "signatures", "backend"})
parser.add_argument("-F", "--function", type=str, help="Show features for specific function") parser.add_argument("-F", "--function", type=str, help="Show features for specific function")
parser.add_argument("-P", "--process", type=str, help="Show features for specific process name")
args = parser.parse_args(args=argv) args = parser.parse_args(args=argv)
capa.main.handle_common_args(args) capa.main.handle_common_args(args)
@@ -113,16 +116,17 @@ def main(argv=None):
logger.error("%s", str(e)) logger.error("%s", str(e))
return -1 return -1
if (args.format == "freeze") or ( format_ = args.format if args.format != FORMAT_AUTO else get_auto_format(args.sample)
args.format == capa.features.common.FORMAT_AUTO and capa.features.freeze.is_freeze(taste) if format_ == FORMAT_FREEZE:
): # this should be moved above the previous if clause after implementing
# feature freeze for the dynamic analysis flavor
with open(args.sample, "rb") as f: with open(args.sample, "rb") as f:
extractor: FeatureExtractor = capa.features.freeze.load(f.read()) extractor: FeatureExtractor = capa.features.freeze.load(f.read())
else: else:
should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None)
try: try:
extractor = capa.main.get_extractor( extractor = capa.main.get_extractor(
args.sample, args.format, args.os, args.backend, sig_paths, should_save_workspace args.sample, format_, args.os, args.backend, sig_paths, should_save_workspace
) )
except capa.exceptions.UnsupportedFormatError: except capa.exceptions.UnsupportedFormatError:
capa.helpers.log_unsupported_format_error() capa.helpers.log_unsupported_format_error()
@@ -131,7 +135,17 @@ def main(argv=None):
log_unsupported_runtime_error() log_unsupported_runtime_error()
return -1 return -1
assert isinstance(extractor, StaticFeatureExtractor) if format_ in DYNAMIC_FORMATS:
assert isinstance(extractor, DynamicFeatureExtractor)
print_dynamic_analysis(extractor, args)
else:
assert isinstance(extractor, StaticFeatureExtractor)
print_static_analysis(extractor, args)
return 0
def print_static_analysis(extractor: StaticFeatureExtractor, args):
for feature, addr in extractor.extract_global_features(): for feature, addr in extractor.extract_global_features():
print(f"global: {format_address(addr)}: {feature}") print(f"global: {format_address(addr)}: {feature}")
@@ -156,9 +170,90 @@ def main(argv=None):
print(f"{args.function} not a function") print(f"{args.function} not a function")
return -1 return -1
print_features(function_handles, extractor) print_static_features(function_handles, extractor)
return 0
def print_dynamic_analysis(extractor: DynamicFeatureExtractor, args):
    """
    Print the features produced by a dynamic feature extractor.

    Global features are always printed. File features are printed only when
    no process filter is given. Process and thread features are printed via
    print_dynamic_features() for every process, or only for the processes
    whose name equals `args.process` when that option is set.

    Args:
        extractor: the dynamic feature extractor to read features from.
        args: parsed command line arguments; only `args.process` is read here.

    Returns:
        -1 when `args.process` is set and no process has that name;
        otherwise None.
    """
    for feature, addr in extractor.extract_global_features():
        print(f"global: {format_address(addr)}: {feature}")

    if not args.process:
        for feature, addr in extractor.extract_file_features():
            print(f"file: {format_address(addr)}: {feature}")

    process_handles = tuple(extractor.get_processes())

    if args.process:
        process_handles = tuple(ph for ph in process_handles if ph.inner["name"] == args.process)
        # after filtering, an empty tuple means no process carries the requested name.
        # (the check must look at the process handles, not at the characters of the
        # requested name, which have no `.inner` attribute.)
        if not process_handles:
            print(f"{args.process} not a process")
            return -1

    print_dynamic_features(process_handles, extractor)
def print_static_features(functions, extractor: StaticFeatureExtractor):
# Print the function-, basic-block- and instruction-scope features of every
# function in `functions`, as reported by `extractor`.
# Features for which is_global_feature() is true are skipped at every scope.
# NOTE(review): indentation is not visible in this view; the nesting described
# in the comments below is inferred from which loop variables each line uses.
for f in functions:
# library functions are not printed; they are only mentioned in the debug log
if extractor.is_library_function(f.address):
function_name = extractor.get_function_name(f.address)
logger.debug("skipping library function %s (%s)", format_address(f.address), function_name)
continue
print(f"func: {format_address(f.address)}")
# function-scope features
for feature, addr in extractor.extract_function_features(f):
if is_global_feature(feature):
continue
# show the feature's own address only when it differs from the function address
if f.address != addr:
print(f" func: {format_address(f.address)}: {feature} -> {format_address(addr)}")
else:
print(f" func: {format_address(f.address)}: {feature}")
# basic-block-scope features
for bb in extractor.get_basic_blocks(f):
for feature, addr in extractor.extract_basic_block_features(f, bb):
if is_global_feature(feature):
continue
# show the feature's own address only when it differs from the basic block address
if bb.address != addr:
print(f" bb: {format_address(bb.address)}: {feature} -> {format_address(addr)}")
else:
print(f" bb: {format_address(bb.address)}: {feature}")
# instruction-scope features (uses both `f` and `bb`)
for insn in extractor.get_instructions(f, bb):
for feature, addr in extractor.extract_insn_features(f, bb, insn):
if is_global_feature(feature):
continue
try:
# NOTE(review): this branch prints the function address followed by the
# instruction address, while the else branch prints only the instruction
# address - confirm the difference is intended.
if insn.address != addr:
print(
f" insn: {format_address(f.address)}: {format_address(insn.address)}: {feature} -> {format_address(addr)}"
)
else:
print(f" insn: {format_address(insn.address)}: {feature}")
except UnicodeEncodeError:
# may be an issue while piping to less and encountering non-ascii characters
continue
def print_dynamic_features(processes, extractor: DynamicFeatureExtractor):
# Print the process- and thread-scope features of every process in `processes`,
# as reported by `extractor`.
# Features for which is_global_feature() is true are skipped at both scopes.
# NOTE(review): indentation is not visible in this view; the thread loop is
# assumed to be nested inside the process loop because it uses `p`.
for p in processes:
# one header line per process: name and ppid come from the handle's `inner`
# mapping, pid from the handle itself
print(f"proc: {p.inner['name']} (ppid={p.inner['ppid']}, pid={p.pid})")
# process-scope features; `addr` is unpacked but not printed
for feature, addr in extractor.extract_process_features(p):
if is_global_feature(feature):
continue
print(f" proc: {p.inner['name']}: {feature}")
# thread-scope features, labelled with the thread id; `addr` is unpacked but not printed
for t in extractor.get_threads(p):
for feature, addr in extractor.extract_thread_features(p, t):
if is_global_feature(feature):
continue
print(f" thread: {t.tid}: {feature}")
def ida_main(): def ida_main():
@@ -185,57 +280,11 @@ def ida_main():
print(f"{hex(function)} not a function") print(f"{hex(function)} not a function")
return -1 return -1
print_features(function_handles, extractor) print_static_features(function_handles, extractor)
return 0 return 0
def print_features(functions, extractor: StaticFeatureExtractor):
for f in functions:
if extractor.is_library_function(f.address):
function_name = extractor.get_function_name(f.address)
logger.debug("skipping library function %s (%s)", format_address(f.address), function_name)
continue
print(f"func: {format_address(f.address)}")
for feature, addr in extractor.extract_function_features(f):
if capa.features.common.is_global_feature(feature):
continue
if f.address != addr:
print(f" func: {format_address(f.address)}: {feature} -> {format_address(addr)}")
else:
print(f" func: {format_address(f.address)}: {feature}")
for bb in extractor.get_basic_blocks(f):
for feature, addr in extractor.extract_basic_block_features(f, bb):
if capa.features.common.is_global_feature(feature):
continue
if bb.address != addr:
print(f" bb: {format_address(bb.address)}: {feature} -> {format_address(addr)}")
else:
print(f" bb: {format_address(bb.address)}: {feature}")
for insn in extractor.get_instructions(f, bb):
for feature, addr in extractor.extract_insn_features(f, bb, insn):
if capa.features.common.is_global_feature(feature):
continue
try:
if insn.address != addr:
print(
f" insn: {format_address(f.address)}: {format_address(insn.address)}: {feature} -> {format_address(addr)}"
)
else:
print(f" insn: {format_address(insn.address)}: {feature}")
except UnicodeEncodeError:
# may be an issue while piping to less and encountering non-ascii characters
continue
if __name__ == "__main__": if __name__ == "__main__":
if capa.main.is_runtime_ida(): if capa.main.is_runtime_ida():
ida_main() ida_main()