Merge pull request #828 from mandiant/profiling

profile infrastructure
This commit is contained in:
Willi Ballenthin
2021-11-09 16:09:34 -07:00
committed by GitHub
8 changed files with 208 additions and 0 deletions

3
.gitignore vendored
View File

@@ -115,3 +115,6 @@ isort-output.log
black-output.log
rule-linter-output.log
.vscode
scripts/perf/*.txt
scripts/perf/*.svg
scripts/perf/*.zip

View File

@@ -20,6 +20,8 @@
### Development
- add profiling infrastructure #828 @williballenthin
### Raw diffs
- [capa <release>...master](https://github.com/mandiant/capa/compare/v3.0.3...master)
- [capa-rules <release>...master](https://github.com/mandiant/capa-rules/compare/v3.0.3...master)

View File

@@ -10,6 +10,7 @@ import copy
import collections
from typing import Set, Dict, List, Tuple, Union, Mapping, Iterable
import capa.perf
import capa.rules
import capa.features.common
from capa.features.common import Result, Feature
@@ -79,6 +80,9 @@ class And(Statement):
self.children = children
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.and"] += 1
results = [child.evaluate(ctx) for child in self.children]
success = all(results)
return Result(success, self, results)
@@ -92,6 +96,9 @@ class Or(Statement):
self.children = children
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.or"] += 1
results = [child.evaluate(ctx) for child in self.children]
success = any(results)
return Result(success, self, results)
@@ -105,6 +112,9 @@ class Not(Statement):
self.child = child
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.not"] += 1
results = [self.child.evaluate(ctx)]
success = not results[0]
return Result(success, self, results)
@@ -119,6 +129,9 @@ class Some(Statement):
self.children = children
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.some"] += 1
results = [child.evaluate(ctx) for child in self.children]
# note that here we cast the child result as a bool
# because we've overridden `__bool__` above.
@@ -138,6 +151,9 @@ class Range(Statement):
self.max = max if max is not None else (1 << 64 - 1)
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.range"] += 1
count = len(ctx.get(self.child, []))
if self.min == 0 and count == 0:
return Result(True, self, [])

View File

@@ -16,6 +16,7 @@ if TYPE_CHECKING:
# circular import, otherwise
import capa.engine
import capa.perf
import capa.features
import capa.features.extractors.elf
@@ -146,6 +147,8 @@ class Feature:
return str(self)
def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result:
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature." + self.name] += 1
return Result(self in ctx, self, [], locations=ctx.get(self, []))
def freeze_serialize(self):
@@ -190,6 +193,9 @@ class Substring(String):
self.value = value
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.substring"] += 1
# mapping from string value to list of locations.
# will unique the locations later on.
matches = collections.defaultdict(list)
@@ -275,6 +281,9 @@ class Regex(String):
)
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.regex"] += 1
# mapping from string value to list of locations.
# will unique the locations later on.
matches = collections.defaultdict(list)
@@ -358,6 +367,9 @@ class Bytes(Feature):
self.value = value
def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.bytes"] += 1
for feature, locations in ctx.items():
if not isinstance(feature, (Bytes,)):
continue

View File

@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations
"""
import os
import sys
import time
import hashlib
import logging
import os.path
@@ -17,6 +18,7 @@ import argparse
import datetime
import textwrap
import itertools
import contextlib
import collections
from typing import Any, Dict, List, Tuple
@@ -26,6 +28,7 @@ import colorama
from pefile import PEFormatError
from elftools.common.exceptions import ELFError
import capa.perf
import capa.rules
import capa.engine
import capa.version
@@ -65,6 +68,14 @@ E_UNSUPPORTED_IDA_VERSION = -19
logger = logging.getLogger("capa")
@contextlib.contextmanager
def timing(msg: str):
t0 = time.time()
yield
t1 = time.time()
logger.debug("perf: %s: %0.2fs", msg, t1 - t0)
def set_vivisect_log_level(level):
logging.getLogger("vivisect").setLevel(level)
logging.getLogger("vivisect.base").setLevel(level)
@@ -892,6 +903,7 @@ def main(argv=None):
try:
rules = get_rules(args.rules, disable_progress=args.quiet)
rules = capa.rules.RuleSet(rules)
logger.debug(
"successfully loaded %s rules",
# during the load of the RuleSet, we extract subscope statements into their own rules

10
capa/perf.py Normal file
View File

@@ -0,0 +1,10 @@
import collections
from typing import Dict
# this structure is unstable and may change before the next major release.
counters: Dict[str, int] = collections.Counter()
def reset():
global counters
counters = collections.Counter()

View File

@@ -27,6 +27,7 @@ from typing import Any, Dict, List, Union, Iterator
import yaml
import ruamel.yaml
import capa.perf
import capa.engine as ceng
import capa.features
import capa.features.file
@@ -620,6 +621,8 @@ class Rule:
yield new_rule
def evaluate(self, features: FeatureSet):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.rule"] += 1
return self.statement.evaluate(features)
@classmethod

150
scripts/profile-time.py Normal file
View File

@@ -0,0 +1,150 @@
"""
Invoke capa multiple times and record profiling informations.
Use the --number and --repeat options to change the number of iterations.
By default, the script will emit a markdown table with a label pulled from git.
Note: you can run this script against pre-generated .frz files to reduce the startup time.
usage:
usage: profile-time.py [--number NUMBER] [--repeat REPEAT] [--label LABEL] sample
Profile capa performance
positional arguments:
sample path to sample to analyze
optional arguments:
--number NUMBER batch size of profile collection
--repeat REPEAT batch count of profile collection
--label LABEL description of the profile collection
example:
$ python profile-time.py ./tests/data/kernel32.dll_.frz --number 1 --repeat 2
| label | count(evaluations) | avg(time) | min(time) | max(time) |
|--------------------------------------|----------------------|-------------|-------------|-------------|
| 18c30e4 main: remove perf debug msgs | 66,561,622 | 132.13s | 125.14s | 139.12s |
^^^ --label or git hash
"""
import sys
import timeit
import logging
import argparse
import subprocess
import tqdm
import tabulate
import capa.main
import capa.perf
import capa.rules
import capa.engine
import capa.helpers
import capa.features
import capa.features.common
import capa.features.freeze
logger = logging.getLogger("capa.profile")
def main(argv=None):
if argv is None:
argv = sys.argv[1:]
label = subprocess.run(
"git show --pretty=oneline --abbrev-commit | head -n 1", shell=True, capture_output=True, text=True
).stdout.strip()
is_dirty = (
subprocess.run(
"git status | grep 'modified: ' | grep -v 'rules' | grep -v 'tests/data'",
shell=True,
capture_output=True,
text=True,
).stdout
!= ""
)
if is_dirty:
label += " (dirty)"
parser = argparse.ArgumentParser(description="Profile capa performance")
capa.main.install_common_args(parser, wanted={"format", "sample", "signatures", "rules"})
parser.add_argument("--number", type=int, default=3, help="batch size of profile collection")
parser.add_argument("--repeat", type=int, default=30, help="batch count of profile collection")
parser.add_argument("--label", type=str, default=label, help="description of the profile collection")
args = parser.parse_args(args=argv)
capa.main.handle_common_args(args)
try:
taste = capa.helpers.get_file_taste(args.sample)
except IOError as e:
logger.error("%s", str(e))
return -1
try:
with capa.main.timing("load rules"):
rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True))
except (IOError) as e:
logger.error("%s", str(e))
return -1
try:
sig_paths = capa.main.get_signatures(args.signatures)
except (IOError) as e:
logger.error("%s", str(e))
return -1
if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)):
with open(args.sample, "rb") as f:
extractor = capa.features.freeze.load(f.read())
else:
extractor = capa.main.get_extractor(
args.sample, args.format, capa.main.BACKEND_VIV, sig_paths, should_save_workspace=False
)
with tqdm.tqdm(total=args.number * args.repeat) as pbar:
def do_iteration():
capa.perf.reset()
capa.main.find_capabilities(rules, extractor, disable_progress=True)
pbar.update(1)
samples = timeit.repeat(do_iteration, number=args.number, repeat=args.repeat)
logger.debug("perf: find capabilities: min: %0.2fs" % (min(samples) / float(args.number)))
logger.debug("perf: find capabilities: avg: %0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)))
logger.debug("perf: find capabilities: max: %0.2fs" % (max(samples) / float(args.number)))
for (counter, count) in capa.perf.counters.most_common():
logger.debug("perf: counter: {:}: {:,}".format(counter, count))
print(
tabulate.tabulate(
[
(
args.label,
"{:,}".format(capa.perf.counters["evaluate.feature"]),
# python documentation indicates that min(samples) should be preferred,
# so lets put that first.
#
# https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
"%0.2fs" % (min(samples) / float(args.number)),
"%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)),
"%0.2fs" % (max(samples) / float(args.number)),
)
],
headers=["label", "count(evaluations)", "min(time)", "avg(time)", "max(time)"],
tablefmt="github",
)
)
return 0
if __name__ == "__main__":
sys.exit(main())