import source files, forgetting about 938 prior commits

This commit is contained in:
William Ballenthin
2020-06-18 09:13:01 -06:00
parent f2d795090c
commit add3537447
65 changed files with 10322 additions and 0 deletions

71
scripts/testbed/README.md Normal file
View File

@@ -0,0 +1,71 @@
# Testbed
Goal of the testbed is to support the development of new `capa` rules. Scripts allow to test rules against a large sample set and to batch process samples, e.g. to freeze features or to generate other meta data used for testing.
The testbed contains malicious and benign files. Data sources are:
- Microsoft EXE and DLL files from `C:\Windows\System32`, `C:\Windows\SysWOW64`, etc.
- samples analyzed and annotated by FLARE analysts during malware analysis
Samples containing the keyword `slow` in their path indicate a longer test run time (>20 seconds) and can be ignored via the `-f` argument.
Running a rule against a large set of executable programs helps to quickly determine on which functions/samples a rule hits. This helps to identify:
- true positives: hits on expected functions
- false positives: hits on unexpected functions, for example
- if a rule is too generic or
- if a rule hits on a capability present in many (benign) samples
To provide additional context the testbed contains function names from the following data sources:
- benign files: function names from Microsoft's PDB information
- malicious files: function names provided by FLARE analysts and obtained from
the LabelMaker 2000 (LM2k) annotations repository
For each test sample the testbed contains the following files:
- a `.frz` file storing the extracted `capa` features
- `capa`'s serialized features, via `capa.features.freeze`
- a `.fnames` file mapping function addresses to function names
- JSON file that maps fvas to function names or
- CSV file with entries `idbmd5;md5;fva;fname`
- (optional) the binary file with extension `.exe_`, `.dll_`, or `.mal_`
## Scripts
### `run_rule_on_testbed.py`
Run a `capa` rule file against the testbed (frozen features in a directory).
Meant to be run on directories that contain `.frz` and `.fnames` files.
Example usage:
run_rule_on_testbed.py <testbed dir>
run_rule_on_testbed.py samples
With the `-s <image_path>` argument, the script exports images of function graphs to the provided path.
Converting the images requires `graphviz`. See https://graphviz.gitlab.io/about/; get Python interface via `pip install graphviz`.
## Helper Scripts
### `freeze_features.py`
Use `freeze_features.py` to freeze `capa` features of a file or of files in a directory.
Example usage:
freeze_features.py <testbed dir>
freeze_features.py samples
### `start_ida_dump_fnames.py`
Start IDA Pro in autonomous mode to dump JSON file of function names `{fva: fname}`. Processes a single file or a directory.
This script uses `_dump_fnames.py` to dump the JSON file of function names and is meant to be run on benign files with PDB information. IDA should apply function names from the PDB information automatically.
Example usage:
start_ida_dump_fnames.py <candidate files dir>
start_ida_dump_fnames.py samples\benign
### `start_ida_export_fimages.py`
Start IDA Pro in autonomous mode to export images of function graphs.
`run_rule_on_testbed.py` integrates the export mechanism (`-s` option)
This script uses `_export_fimages.py` to export DOT files of function graphs and then converts them to PNG images using `graphviz`.
Example usage:
start_ida_export_fimages.py <target file> <output dir> -f <function list>
start_ida_export_fimages.py test.exe imgs -f 0x401000,0x402F90

View File

@@ -0,0 +1,2 @@
# file extension for the per-sample function name mapping (JSON or CSV), see README
FNAMES_EXTENSION = '.fnames'
# file extension for serialized capa features (capa.features.freeze)
FREEZE_EXTENSION = '.frz'

View File

@@ -0,0 +1,46 @@
'''
IDAPython script to dump JSON file of functions names { fva: fname }.
Meant to be run on benign files with PDB information. IDA should apply function names from the PDB files automatically.
Can also be run on annotated IDA database files.
Example usage (via IDA autonomous mode):
ida.exe -A -S_dump_fnames.py "<output path>" <sample_path>
'''
import json
import idc
import idautils
def main():
    '''
    Dump a JSON mapping { fva: fname } of named functions to the path in idc.ARGV[1].

    Skips IDA's auto-generated "sub_" names and prefers demangled names when
    available. Exits IDA with -1 when the output argument is missing, 0 on success.
    '''
    if len(idc.ARGV) != 2:
        # requires output file path argument
        idc.qexit(-1)
    # wait for auto-analysis to finish
    idc.auto_wait()
    INF_SHORT_DN_ATTR = idc.get_inf_attr(idc.INF_SHORT_DN)  # short form of demangled names
    fnames = {}
    for f in idautils.Functions():
        fname = idc.get_name(f)
        if fname.startswith("sub_"):
            # skip IDA's default auto-generated names; they carry no information
            continue
        name_demangled = idc.demangle_name(fname, INF_SHORT_DN_ATTR)
        if name_demangled:
            fname = name_demangled
        fnames[f] = fname
    # NOTE: `f` (last function ea) is shadowed here by the file handle; the loop is done
    with open(idc.ARGV[1], "w") as f:
        json.dump(fnames, f)
    # exit IDA
    idc.qexit(0)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,44 @@
'''
IDAPython script to export DOT files of function graphs.
Example usage (via IDA autonomous mode):
ida.exe -A -S_export_fimages.py "<output dir>" <fva1> [<fva2> ...] <sample_path>
'''
import os
import idc
import idaapi
import ida_gdl
def main():
    '''
    Export DOT flow graph files for the requested functions.

    idc.ARGV[1] is the output directory; idc.ARGV[2:] are hex-formatted
    function VAs. Exits IDA with -1 on missing arguments, 0 otherwise.
    '''
    if len(idc.ARGV) < 3:
        # requires output directory and function VAs argument(s)
        idc.qexit(-1)
    # wait for auto-analysis to finish
    idc.auto_wait()
    out_dir = idc.ARGV[1]
    fvas = [int(fva, 0x10) for fva in idc.ARGV[2:]]
    idb_name = os.path.split(idc.get_idb_path())[-1]
    for fva in fvas:
        fstart = idc.get_func_attr(fva, idc.FUNCATTR_START)
        # e.g. "sample_exe_0x401000"; dots replaced so the name is filesystem friendly
        name = '%s_0x%x' % (idb_name.replace('.', '_'), fstart)
        out_path = os.path.join(out_dir, name)
        fname = idc.get_name(fstart)
        if not ida_gdl.gen_flow_graph(out_path, '%s (0x%x)' % (fname, fstart), idaapi.get_func(fstart), 0, 0,
                                      ida_gdl.CHART_GEN_DOT | ida_gdl.CHART_PRINT_NAMES):
            # print() call instead of the python 2-only print statement, for
            # consistency with the other testbed scripts and python 3 IDAPython
            print('IDA error generating flow graph')
    # TODO add label to DOT file, see https://stackoverflow.com/a/6452088/10548020
    # TODO highlight where rule matched
    # exit IDA
    idc.qexit(0)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,102 @@
'''
Freeze capa features.
Example usage:
freeze_features.py <test files dir>
freeze_features.py samples\benign
'''
import os
import sys
import time
import logging
import argparse
from scripts.testbed import FREEZE_EXTENSION
from capa.features.freeze import main as freeze_features
# only process files with these extensions
# only process files with these extensions
TARGET_EXTENSIONS = [
    '.mal_',
    '.exe_',
    '.dll_',
    '.sys_'
]
# use the module name for log records; the previous name 'check_rule' was a
# copy/paste from run_rule_on_testbed.py and mislabeled this script's output
logger = logging.getLogger(__name__)
def freeze(input_path, reprocess):
    '''
    Freeze capa features for a file, or recursively for all target files in a directory.

    :param input_path: file or directory to process
    :param reprocess: overwrite existing freeze files when True
    :raises IOError: if input_path does not exist
    '''
    if not os.path.exists(input_path):
        raise IOError('%s does not exist or cannot be accessed' % input_path)
    if os.path.isdir(input_path):
        logger.info('freezing features of %s files in %s', '|'.join(TARGET_EXTENSIONS), input_path)
        for root, _dirs, filenames in os.walk(input_path):
            for filename in filenames:
                ext = os.path.splitext(filename)[1]
                if ext not in TARGET_EXTENSIONS:
                    logger.debug('skipping non-target file: %s', filename)
                    continue
                target = os.path.join(root, filename)
                freeze_file(target, target + FREEZE_EXTENSION, reprocess)
    elif os.path.isfile(input_path):
        freeze_file(input_path, input_path + FREEZE_EXTENSION, reprocess)
def freeze_file(path, output, reprocess=False):
    '''
    Freeze capa features of a single file to `output`.

    An existing output file is left untouched unless `reprocess` is True.
    Extraction errors are logged, not raised (best effort over a large corpus).
    '''
    logger.info('freezing features of %s', path)
    if not reprocess and os.path.exists(output):
        logger.info('%s already exists, provide -r argument to reprocess', output)
        return
    try:
        # capa.features.freeze.main takes argv-style arguments: [sample, output]
        freeze_features([path, output])
    except Exception as e:
        logger.error('could not freeze features for %s: %s', path, str(e))
def main(argv=None):
    '''
    Command line entry point: freeze capa features of a file or directory.

    :return: 0 on success, -1 when the input path is inaccessible
    '''
    if argv is None:
        argv = sys.argv[1:]
    parser = argparse.ArgumentParser(description="Freeze capa features of a file or of files in a directory")
    parser.add_argument("file_path", type=str,
                        help="Path to file or directory to analyze")
    parser.add_argument("-r", "--reprocess", action="store_true", default=False,
                        help="Overwrite existing analysis")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable verbose output")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Disable all output but errors")
    args = parser.parse_args(args=argv)
    # -q wins over -v; default verbosity is INFO
    if args.quiet:
        level = logging.ERROR
    elif args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger().setLevel(level)
    start = time.time()
    try:
        freeze(args.file_path, args.reprocess)
    except IOError as e:
        logger.error('%s', str(e))
        return -1
    logger.info('freezing features took %d seconds', time.time() - start)
    return 0
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,297 @@
'''
Run a capa rule file against the testbed (frozen features in a directory).
Example usage:
run_rule_on_testbed.py <path to rules> <rule name> <testbed dir>
run_rule_on_testbed.py ..\\rules "create pipe" samples
'''
import os
import sys
import json
import time
import logging
from collections import defaultdict
import argparse
import capa.main
import capa.rules
import capa.features.freeze
from scripts.testbed import FNAMES_EXTENSION, FREEZE_EXTENSION
from start_ida_export_fimages import export_fimages
logger = logging.getLogger(__name__)
# sorry globals...
# counters aggregated across all checked samples, reported by print_summary()
file_count = 0  # samples successfully loaded and checked
file_hits = 0  # samples with at least one matching function
mal_hits = 0  # matching samples categorized as malicious ('MAL')
other_hits = 0  # matching samples categorized as benign/unknown
function_hits = 0  # total number of matching functions across all samples
errors = 0  # samples that failed to load
function_names = set([])  # unique names of matched functions (verbose summary)
# path keyword -> category label; paths matching neither are 'UNK'
CATEGORY = {
    'malicious': 'MAL',
    'benign': 'BEN',
}
def check_rule(path, rules, rule_name, only_matching, save_image, verbose):
    '''
    Check `rule_name` against the frozen features at `path`, print the result,
    and update the module-level counters.

    :param path: path to a .frz freeze file
    :param rules: capa RuleSet containing the rule and its dependencies
    :param rule_name: name of the rule to report on
    :param only_matching: suppress output for non-matching samples
    :param save_image: directory for exported function graph images, or falsy to skip
    :param verbose: 0 = summary only, 1 = list matched functions, >1 = full match details
    '''
    global file_count, file_hits, mal_hits, other_hits, function_hits, errors
    try:
        capabilities = get_capabilities(path, rules)
    except (ValueError, KeyError) as e:
        # malformed/incompatible freeze file; count it and move on
        logger.error('cannot load %s due to %s: %s', path, type(e).__name__, str(e))
        errors += 1
        return
    file_count += 1
    hits = get_function_hits(capabilities, rule_name)
    if hits == 0:
        if not only_matching:
            render_no_hit(path)
    else:
        print('[x] rule matches %d function(s) in %s (%s)' % (hits, path, get_category(path)))
        file_hits += 1
        function_hits += hits
        if get_category(path) == 'MAL':
            mal_hits += 1
        else:
            other_hits += 1
        if verbose:
            # verbose > 1 additionally renders the full match tree per function
            render_hit_verbose(capabilities, path, verbose > 1)
        if save_image:
            fvas = ['0x%x' % fva for fva in get_hit_fvas(capabilities)]
            # image export needs the IDA database or the original binary next to the .frz
            file_path = get_idb_or_sample_path(path)
            if file_path:
                if not export_fimages(file_path, save_image, fvas):
                    logger.warning('exporting images failed')
            else:
                logger.warning('could not get IDB or sample path')
def get_idb_or_sample_path(path):
    '''
    Return the first existing IDA database or sample file next to a freeze file.

    Candidates are built from the path without its extension and from the full
    path, each combined with the known database/sample extensions.
    :return: the first existing candidate path, or None
    '''
    base = os.path.splitext(path)[0]
    candidates = []
    for ext in ['.idb', '.i64', '.exe_', '.dll_', '.mal_']:
        candidates.append(base + ext)
        candidates.append(path + ext)
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None
def get_capabilities(path, rules):
    '''
    Load frozen features from the .frz file at `path` and match `rules` against them.

    :return: capa capabilities mapping { rule name: [(fva, result), ...] }
    :raises ValueError, KeyError: on malformed freeze data (handled by caller)
    '''
    logger.debug('matching rules in %s', path)
    with open(path, 'rb') as f:
        extractor = capa.features.freeze.load(f.read())
    return capa.main.find_capabilities(rules, extractor, disable_progress=True)
def get_function_hits(capabilities, rule_name):
    ''' Return the number of functions in which the rule `rule_name` matched. '''
    matches = capabilities.get(rule_name)
    return len(matches) if matches else 0
def get_category(path):
    '''
    Classify a sample path by keyword: 'MAL' (malicious), 'BEN' (benign),
    or 'UNK' when neither keyword occurs in the path.
    '''
    for keyword, label in CATEGORY.items():
        if keyword in path:
            return label
    return 'UNK'
def render_no_hit(path):
    ''' Print a no-match line for `path`, including its category. '''
    category = get_category(path)
    print('[ ] no match in %s (%s)' % (path, category))
def render_hit_verbose(capabilities, path, vverbose):
    '''
    Print every matched function, with its name when a .fnames file is
    available next to the freeze file; with `vverbose`, also render the full
    match details. Records matched names in the module-level `function_names`.
    '''
    try:
        fnames = load_fnames(path)
    except IOError as e:
        # missing .fnames file is not fatal; fall back to '<name unknown>'
        logger.error('%s', str(e))
        fnames = None
    for rule, ress in capabilities.items():
        # sort matches by function VA for stable output
        for (fva, res) in sorted(ress, key=lambda p: p[0]):
            if fnames and fva in fnames:
                fname = fnames[fva]
                function_names.add(fname)
            else:
                fname = '<name unknown>'
            print(' - function 0x%x (%s)' % (fva, fname))
            if vverbose:
                capa.main.render_result(res, indent='    ')
def get_hit_fvas(capabilities):
    ''' Return the function VAs of all matches, sorted by VA within each rule. '''
    return [fva
            for ress in capabilities.values()
            for (fva, _res) in sorted(ress, key=lambda p: p[0])]
def load_fnames(path):
    '''
    Load function names for the sample whose freeze file is `path`.

    The sibling .fnames file may be JSON ({ fva: fname }) or CSV
    ("idbmd5;md5;fva;fname" entries, see README).
    :return: dict mapping int fva -> fname
    :raises IOError: if the .fnames file does not exist
    '''
    # replace only the trailing extension; str.replace would also rewrite an
    # occurrence of the extension elsewhere in the path (e.g. a directory name)
    if path.endswith(FREEZE_EXTENSION):
        fnames_path = path[:-len(FREEZE_EXTENSION)] + FNAMES_EXTENSION
    else:
        fnames_path = path + FNAMES_EXTENSION
    if not os.path.exists(fnames_path):
        raise IOError('%s does not exist' % fnames_path)
    logger.debug('fnames path: %s', fnames_path)
    try:
        # json file with format { fva: fname }
        fnames = load_json(fnames_path)
        logger.debug('loaded JSON file')
    except TypeError:
        # csv file with format idbmd5;md5;fva;fname
        fnames = load_csv(fnames_path)
        logger.debug('loaded CSV file')
    fnames = convert_keys_to_int(fnames)
    # lazy %-args so the message is only built when DEBUG logging is enabled
    logger.debug('read %d function names', len(fnames))
    return fnames
def load_json(path):
    '''
    Load a { fva: fname } mapping from a JSON file.

    :raises TypeError: if the file is not valid JSON (signals the CSV fallback)
    '''
    with open(path, 'r') as f:
        try:
            return json.load(f)
        except ValueError as e:
            logger.debug('not a JSON file, %s', str(e))
            raise TypeError
def load_csv(path):
    '''
    Load function names from a CSV file with entries "idbmd5;md5;fva;fname".

    Malformed lines are logged and skipped.
    :return: dict mapping fva string -> fname
    '''
    funcs = defaultdict(str)
    with open(path, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        try:
            # the documented record format uses ';' separators (see README),
            # not ':'; split at most 3 times so the name may itself contain ';'
            idbmd5, md5, fva, name = line.split(';', 3)
        except ValueError as e:
            logger.warning('%s: "%s"', str(e), line)
            # was missing: without this, a malformed line recorded the previous
            # line's fva/name (or raised NameError on the first line)
            continue
        funcs[fva] = name
    return funcs
def convert_keys_to_int(funcs_in):
    '''
    Convert dict keys from strings (decimal or "0x"-prefixed hex, as found in
    JSON/CSV fnames files) to ints.
    '''
    funcs = {}
    # .items() instead of the python 2-only .iteritems(), so this also runs
    # under python 3 (the rest of this script already uses print() calls)
    for k, v in funcs_in.items():
        try:
            k = int(k)
        except ValueError:
            # not decimal: parse as hex, e.g. "0x401000"
            k = int(k, 0x10)
        funcs[k] = v
    return funcs
def print_summary(verbose, start_time):
    '''
    Print aggregate match statistics collected in the module-level counters.

    :param verbose: also list the unique matched function names when truthy
    :param start_time: epoch seconds when the run started
    '''
    global file_count, file_hits, function_hits, errors
    print('\n[SUMMARY]')
    m, s = divmod(time.time() - start_time, 60)
    logger.info('ran for %d:%02d minutes', m, s)
    # guard against division by zero when no sample was processed
    ratio = ' (%d%%)' % ((float(file_hits) / file_count) * 100) if file_count else ''
    print('matched %d function(s) in %d/%d%s sample(s), encountered %d error(s)' % (
        function_hits, file_hits, file_count, ratio, errors))
    print('%d hits on (MAL) files; %d hits on other files' % (mal_hits, other_hits))
    if verbose:
        if len(function_names) > 0:
            print('matched function names (unique):')
            for fname in function_names:
                # print() call instead of the python 2-only print statement,
                # matching the rest of this script
                print(' - %s' % fname)
def main(argv=None):
    '''
    Command line entry point: run one capa rule against frozen feature files.

    :return: 0 on success, -1 on bad arguments or rule loading errors
    '''
    if argv is None:
        argv = sys.argv[1:]
    parser = argparse.ArgumentParser(description="Run capa rule file against frozen features in a directory")
    parser.add_argument("rules", type=str,
                        help="Path to directory containing rules")
    parser.add_argument("rule_name", type=str,
                        help="Name of rule to test")
    parser.add_argument("frozen_path", type=str,
                        help="Path to frozen feature file or directory")
    parser.add_argument("-f", "--fast", action="store_true",
                        help="Don't test slow files")
    parser.add_argument("-o", "--only_matching", action="store_true",
                        help="Print only if rule matches")
    parser.add_argument("-s", "--save_image", action="store",
                        help="Directory to save exported images of function graphs")
    parser.add_argument("-v", "--verbose", action="count", default=0,
                        help="Increase output verbosity")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Disable all output but errors")
    args = parser.parse_args(args=argv)
    if args.quiet:
        logging.basicConfig(level=logging.ERROR)
        logging.getLogger().setLevel(logging.ERROR)
    elif args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger().setLevel(logging.INFO)
    if not os.path.isdir(args.rules):
        logger.error('%s is not a directory', args.rules)
        return -1
    # load the rule and all rules it depends on
    try:
        rules = capa.main.get_rules(args.rules)
        rules = list(capa.rules.get_rules_and_dependencies(rules, args.rule_name))
        rules = capa.rules.RuleSet(rules)
    except IOError as e:
        logger.error('%s', str(e))
        return -1
    except capa.rules.InvalidRule as e:
        logger.error('%s', str(e))
        return -1
    time0 = time.time()
    print('[RULE %s]' % args.rule_name)
    if os.path.isfile(args.frozen_path):
        # single freeze file; previously this case fell through into the
        # directory walk below (the walk of a file yields nothing, but the
        # fall-through was confusing)
        check_rule(args.frozen_path, rules, args.rule_name, args.only_matching, args.save_image, args.verbose)
    elif os.path.isdir(args.frozen_path):
        try:
            # get only freeze files from directory
            freeze_files = []
            for root, dirs, files in os.walk(args.frozen_path):
                for file in files:
                    if not file.endswith(FREEZE_EXTENSION):
                        continue
                    path = os.path.join(root, file)
                    if args.fast and 'slow' in path:
                        logger.debug('fast mode skipping %s', path)
                        continue
                    freeze_files.append(path)
            for path in sorted(freeze_files):
                sample_time0 = time.time()
                check_rule(path, rules, args.rule_name, args.only_matching, args.save_image, args.verbose)
                logger.debug('rule check took %d seconds', time.time() - sample_time0)
        except KeyboardInterrupt:
            logger.info('Received keyboard interrupt, terminating')
    else:
        # previously a nonexistent path silently printed an empty summary
        logger.error('%s does not exist', args.frozen_path)
        return -1
    print_summary(args.verbose, time0)
    return 0
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,131 @@
'''
Start IDA Pro in autonomous mode to dump JSON file of function names { fva: fname }.
Processes a single file or a directory.
Only runs on files with supported file extensions.
Example usage:
start_ida_dump_fnames.py <candidate files dir>
start_ida_dump_fnames.py samples\benign
'''
import os
import sys
import json
import hashlib
import logging
import subprocess
import argparse
from scripts.testbed import FNAMES_EXTENSION
IDA32_PATH = 'C:\\Program Files\\IDA Pro 7.3\\ida.exe'
IDA64_PATH = 'C:\\Program Files\\IDA Pro 7.3\\ida64.exe'
# expected in same directory as this file
DUMP_SCRIPT_PATH = os.path.abspath('_dump_fnames.py')
SUPPORTED_EXTENSIONS = [
'.exe_',
'.dll_',
'.sys_',
'.idb',
'.i64',
]
logger = logging.getLogger(__name__)
def call_ida_dump_script(sample_path, reprocess):
    ''' call IDA in autonomous mode and return True if success, False on failure '''
    logger.info('processing %s (MD5: %s)', sample_path, get_md5_hexdigest(sample_path))
    # TODO detect 64-bit binaries
    if os.path.splitext(sample_path)[-1] == '.i64':
        IDA_PATH = IDA64_PATH
    else:
        IDA_PATH = IDA32_PATH
    # for .idb/.i64 inputs, strip the database extension to get the path stem
    # used for the .fnames output next to the sample
    if sample_path.endswith('.idb') or sample_path.endswith('.i64'):
        sample_path = sample_path[:-4]
    fnames = '%s%s' % (sample_path, FNAMES_EXTENSION)
    if os.path.exists(fnames) and not reprocess:
        logger.info('%s already exists and contains %d function names, provide -r argument to reprocess',
                    fnames, len(get_function_names(fnames)))
        return True
    out_path = os.path.split(fnames)[-1]  # relative to IDA database file
    args = [IDA_PATH, '-A', '-S%s "%s"' % (DUMP_SCRIPT_PATH, out_path), sample_path]
    logger.debug('calling "%s"' % ' '.join(args))
    subprocess.call(args)
    # the dump script writes the .fnames file; its absence indicates failure
    if not os.path.exists(fnames):
        logger.warning('%s was not created', fnames)
        return False
    logger.debug('extracted %d function names to %s', len(get_function_names(fnames)), fnames)
    return True
def get_md5_hexdigest(sample_path):
    ''' Return the MD5 of the file at `sample_path` as a hex string. '''
    with open(sample_path, 'rb') as f:
        digest = hashlib.md5(f.read())
    return digest.hexdigest()
def get_function_names(fnames_file):
    ''' Load the { fva: fname } mapping from a JSON .fnames file, or None if missing. '''
    if os.path.exists(fnames_file):
        with open(fnames_file, 'r') as f:
            return json.load(f)
    return None
def main():
    '''
    Command line entry point: dump function names for a file, or for all
    supported files under a directory.

    :return: 0 on success, -1 when the input path does not exist
    '''
    parser = argparse.ArgumentParser(
        description="Launch IDA Pro in autonomous mode to dump function names of a file or of files in a directory")
    parser.add_argument("file_path", type=str,
                        help="File or directory path to analyze")
    parser.add_argument("-r", "--reprocess", action="store_true", default=False,
                        help="Overwrite existing analysis")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable verbose output")
    args = parser.parse_args(args=sys.argv[1:])
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger().setLevel(level)
    if not os.path.exists(args.file_path):
        logger.warning('%s does not exist', args.file_path)
        return -1
    if os.path.isfile(args.file_path):
        call_ida_dump_script(args.file_path, args.reprocess)
        return 0
    # directory input: walk recursively and process supported files
    logger.info('processing files in %s with file extension %s', args.file_path, '|'.join(SUPPORTED_EXTENSIONS))
    error_count = 0
    for root, _dirs, filenames in os.walk(args.file_path):
        for filename in filenames:
            if os.path.splitext(filename)[1] not in SUPPORTED_EXTENSIONS:
                logger.debug('%s does not have supported file extension', filename)
                continue
            if not call_ida_dump_script(os.path.join(root, filename), args.reprocess):
                error_count += 1
    if error_count:
        logger.warning('encountered %d errors', error_count)
    return 0
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,135 @@
'''
Start IDA Pro in autonomous mode to export images of function graphs.
Example usage:
start_ida_export_fimages.py <target file> <output dir> -f <function list>
start_ida_export_fimages.py test.exe imgs -f 0x401000,0x402F90
'''
import os
import imp
import sys
import hashlib
import logging
import subprocess
import argparse
# graphviz is optional: it is only needed to convert exported DOT files to
# images. The import itself raises ImportError when the package is absent,
# so the previous imp.find_module() probe was redundant (and `imp` is
# deprecated under python 3)
try:
    from graphviz import Source
    graphviz_found = True
except ImportError:
    graphviz_found = False
IDA32_PATH = 'C:\\Program Files\\IDA Pro 7.3\\ida.exe'
IDA64_PATH = 'C:\\Program Files\\IDA Pro 7.3\\ida64.exe'
# expected in same directory as this file
EXPORT_SCRIPT_PATH = os.path.abspath('_export_fimages.py')
logger = logging.getLogger(__name__)
def export_fimages(file_path, out_dir, functions, manual=False):
    '''
    Export images of function graphs.
    :param file_path: file to analyze
    :param out_dir: output directory
    :param functions: list of strings of hex formatted fvas
    :param manual: non-autonomous mode
    :return: True on success, False otherwise
    '''
    if not graphviz_found:
        logger.warning('please install graphviz to export images')
        return False
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    script_args = [os.path.abspath(out_dir)] + functions
    # the IDA script writes one .dot file per requested function into out_dir
    call_ida_script(EXPORT_SCRIPT_PATH, script_args, file_path, manual)
    img_count = 0
    for root, dirs, files in os.walk(out_dir):
        for file in files:
            if not file.endswith('.dot'):
                continue
            try:
                s = Source.from_file(file, directory=out_dir)
                s.render(file, directory=out_dir, format='png', cleanup=True)
                img_count += 1
            except Exception as e:
                # was `except BaseException`, which also swallowed
                # KeyboardInterrupt/SystemExit; include the cause in the log
                logger.warning('graphviz error rendering %s: %s', file, str(e))
    if img_count > 0:
        logger.info('exported %d function graph images to "%s"', img_count, os.path.abspath(out_dir))
        return True
    else:
        logger.warning('failed to export function graph images')
        return False
def call_ida_script(script_path, script_args, sample_path, manual):
    '''
    Run an IDAPython script against `sample_path` via the IDA command line.

    :param manual: when True, omit -A so IDA shows its dialog boxes
    :return: True when IDA exits with status 0, False otherwise
    '''
    logger.info('processing %s (MD5: %s)', sample_path, get_md5_hexdigest(sample_path))
    # TODO detect 64-bit binaries
    ida_path = IDA64_PATH if os.path.splitext(sample_path)[-1] == '.i64' else IDA32_PATH
    cmd = [ida_path, '-S%s %s' % (script_path, ' '.join(script_args)), sample_path]
    if not manual:
        # -A: autonomous mode, suppress dialog boxes
        cmd.insert(1, '-A')
    logger.debug('calling "%s"' % ' '.join(cmd))
    return subprocess.call(cmd) == 0
def get_md5_hexdigest(sample_path):
    ''' Return the MD5 of the file at `sample_path` as a hex string. '''
    hasher = hashlib.md5()
    with open(sample_path, 'rb') as f:
        hasher.update(f.read())
    digest = hasher.hexdigest()
    return digest
def main():
    '''
    Command line entry point: export images of function graphs via IDA.

    :return: 0 on success, -1 on bad arguments
    '''
    parser = argparse.ArgumentParser(
        description="Launch IDA Pro in autonomous mode to export images of function graphs")
    parser.add_argument("file_path", type=str,
                        help="File to export from")
    parser.add_argument("out_dir", type=str,
                        help="Export target directory")
    parser.add_argument("-f", "--functions", action="store",
                        help="Comma separated list of functions to export")
    parser.add_argument("-m", "--manual", action="store_true",
                        help="Manual mode: show IDA dialog boxes")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable verbose output")
    args = parser.parse_args(args=sys.argv[1:])
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger().setLevel(logging.INFO)
    if not os.path.isfile(args.file_path):
        logger.warning('%s is not a file', args.file_path)
        return -1
    if not args.functions:
        # previously crashed with AttributeError (None.split) when -f was omitted
        logger.warning('no functions specified, provide the -f argument')
        return -1
    functions = args.functions.split(',')
    export_fimages(args.file_path, args.out_dir, functions, args.manual)
    return 0
if __name__ == "__main__":
    sys.exit(main())