diff --git a/capa/features/extractors/viv/__init__.py b/capa/features/extractors/viv/__init__.py
index c6511215..47581236 100644
--- a/capa/features/extractors/viv/__init__.py
+++ b/capa/features/extractors/viv/__init__.py
@@ -8,11 +8,7 @@
 
 import types
 
-import file
-import insn
-import function
 import viv_utils
-import basicblock
 
 import capa.features.extractors
 import capa.features.extractors.viv.file
@@ -42,7 +38,7 @@ def add_va_int_cast(o):
     this bit of skullduggery lets use cast viv-utils objects as ints.
     the correct way of doing this is to update viv-utils (or subclass the objects here).
     """
-    setattr(o, "__int__", types.MethodType(get_va, o, type(o)))
+    setattr(o, "__int__", types.MethodType(get_va, o))
     return o
 
 
diff --git a/capa/features/extractors/viv/basicblock.py b/capa/features/extractors/viv/basicblock.py
index 69ae94f6..76ffefe6 100644
--- a/capa/features/extractors/viv/basicblock.py
+++ b/capa/features/extractors/viv/basicblock.py
@@ -125,11 +125,17 @@ def get_printable_len(oper):
 
 
 def is_printable_ascii(chars):
-    return all(ord(c) < 127 and c in string.printable for c in chars)
+    try:
+        chars_str = chars.decode("ascii")
+    except UnicodeDecodeError:
+        return False
+    else:
+        return all(c in string.printable for c in chars_str)
 
 
 def is_printable_utf16le(chars):
-    if all(c == "\x00" for c in chars[1::2]):
+    # iterate a bytearray: iterating bytes yields str on py2 but int on py3, so `c == b"\x00"` never matches on py3
+    if all(c == 0 for c in bytearray(chars[1::2])):
         return is_printable_ascii(chars[::2])
 
 
diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py
index 7b7db57c..6321fcab 100644
--- a/capa/features/extractors/viv/insn.py
+++ b/capa/features/extractors/viv/insn.py
@@ -239,7 +239,7 @@ def read_bytes(vw, va):
     """
     segm = vw.getSegment(va)
     if not segm:
-        raise envi.SegmentationViolation()
+        raise envi.SegmentationViolation(va)
 
     segm_end = segm[0] + segm[1]
     try:
diff --git a/capa/features/freeze.py b/capa/features/freeze.py
index 2e6ff8ee..51f11e53 100644
--- a/capa/features/freeze.py
+++ b/capa/features/freeze.py
@@ -264,6 +264,15 @@ def main(argv=None):
     parser.add_argument(
         "-f", "--format", choices=[f[0] for f in formats], default="auto", help="Select sample format, %s" % format_help
     )
+    if sys.version_info >= (3, 0):
+        parser.add_argument(
+            "-b",
+            "--backend",
+            type=str,
+            help="select the backend to use",
+            choices=(capa.main.BACKEND_VIV, capa.main.BACKEND_SMDA),
+            default=capa.main.BACKEND_VIV,
+        )
     args = parser.parse_args(args=argv)
 
     if args.quiet:
@@ -276,7 +285,8 @@ def main(argv=None):
         logging.basicConfig(level=logging.INFO)
         logging.getLogger().setLevel(logging.INFO)
 
-    extractor = capa.main.get_extractor(args.sample, args.format)
+    backend = args.backend if sys.version_info >= (3, 0) else capa.main.BACKEND_VIV
+    extractor = capa.main.get_extractor(args.sample, args.format, backend)
     with open(args.output, "wb") as f:
         f.write(dump(extractor))
 
diff --git a/capa/main.py b/capa/main.py
index 27d5a9c3..d76c6852 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -32,7 +32,9 @@ import capa.features.extractors
 from capa.helpers import oint, get_file_taste
 
 RULES_PATH_DEFAULT_STRING = "(embedded rules)"
-SUPPORTED_FILE_MAGIC = set(["MZ"])
+SUPPORTED_FILE_MAGIC = set([b"MZ"])
+BACKEND_VIV = "vivisect"
+BACKEND_SMDA = "smda"
 
 
 logger = logging.getLogger("capa")
@@ -303,29 +305,43 @@ class UnsupportedRuntimeError(RuntimeError):
     pass
 
 
-def get_extractor_py3(path, format, disable_progress=False):
-    from smda.SmdaConfig import SmdaConfig
-    from smda.Disassembler import Disassembler
+def get_extractor_py3(path, format, backend, disable_progress=False):
+    if backend == BACKEND_SMDA:
+        from smda.SmdaConfig import SmdaConfig
+        from smda.Disassembler import Disassembler
 
-    import capa.features.extractors.smda
+        import capa.features.extractors.smda
 
-    smda_report = None
-    with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
-        config = SmdaConfig()
-        config.STORE_BUFFER = True
-        smda_disasm = Disassembler(config)
-        smda_report = smda_disasm.disassembleFile(path)
+        smda_report = None
+        with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
+            config = SmdaConfig()
+            config.STORE_BUFFER = True
+            smda_disasm = Disassembler(config)
+            smda_report = smda_disasm.disassembleFile(path)
 
-    return capa.features.extractors.smda.SmdaFeatureExtractor(smda_report, path)
+        return capa.features.extractors.smda.SmdaFeatureExtractor(smda_report, path)
+    else:
+        import capa.features.extractors.viv
+
+        with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
+            vw = get_workspace(path, format, should_save=False)
+
+            try:
+                vw.saveWorkspace()
+            except IOError:
+                # see #168 for discussion around how to handle non-writable directories
+                logger.info("source directory is not writable, won't save intermediate workspace")
+
+        return capa.features.extractors.viv.VivisectFeatureExtractor(vw, path)
 
 
-def get_extractor(path, format, disable_progress=False):
+def get_extractor(path, format, backend=BACKEND_VIV, disable_progress=False):
     """
     raises:
       UnsupportedFormatError:
     """
     if sys.version_info >= (3, 0):
-        return get_extractor_py3(path, format, disable_progress=disable_progress)
+        return get_extractor_py3(path, format, backend, disable_progress=disable_progress)
     else:
         return get_extractor_py2(path, format, disable_progress=disable_progress)
 
@@ -501,6 +517,15 @@ def main(argv=None):
     parser.add_argument(
         "-f", "--format", choices=[f[0] for f in formats], default="auto", help="select sample format, %s" % format_help
     )
+    if sys.version_info >= (3, 0):
+        parser.add_argument(
+            "-b",
+            "--backend",
+            type=str,
+            help="select the backend to use",
+            choices=(BACKEND_VIV, BACKEND_SMDA),
+            default=BACKEND_VIV,
+        )
     parser.add_argument("-t", "--tag", type=str, help="filter on rule meta field values")
     parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text")
     parser.add_argument(
@@ -605,7 +630,8 @@ def main(argv=None):
     else:
         format = args.format
         try:
-            extractor = get_extractor(args.sample, args.format, disable_progress=args.quiet)
+            backend = args.backend if sys.version_info >= (3, 0) else BACKEND_VIV
+            extractor = get_extractor(args.sample, args.format, backend, disable_progress=args.quiet)
         except UnsupportedFormatError:
             logger.error("-" * 80)
             logger.error(" Input file does not appear to be a PE file.")
diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py
index 75ebaab9..65f7c66f 100644
--- a/scripts/bulk-process.py
+++ b/scripts/bulk-process.py
@@ -95,7 +95,7 @@ def get_capa_results(args):
     rules, format, path = args
     logger.info("computing capa results for: %s", path)
     try:
-        extractor = capa.main.get_extractor(path, format, disable_progress=True)
+        extractor = capa.main.get_extractor(path, format, capa.main.BACKEND_VIV, disable_progress=True)
     except capa.main.UnsupportedFormatError:
         # i'm 100% sure if multiprocessing will reliably raise exceptions across process boundaries.
         # so instead, return an object with explicit success/failure status.
diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py
index 9a3d9749..587e1437 100644
--- a/scripts/capa_as_library.py
+++ b/scripts/capa_as_library.py
@@ -191,7 +191,7 @@ def render_dictionary(doc):
 
 def capa_details(file_path, output_format="dictionary"):
     # extract features and find capabilities
-    extractor = capa.main.get_extractor(file_path, "auto", disable_progress=True)
+    extractor = capa.main.get_extractor(file_path, "auto", capa.main.BACKEND_VIV, disable_progress=True)
     capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True)
 
     # collect metadata (used only to make rendering more complete)
diff --git a/scripts/lint.py b/scripts/lint.py
index 20aae6e0..1e177df9 100644
--- a/scripts/lint.py
+++ b/scripts/lint.py
@@ -201,7 +201,7 @@ class DoesntMatchExample(Lint):
                 continue
 
             try:
-                extractor = capa.main.get_extractor(path, "auto", disable_progress=True)
+                extractor = capa.main.get_extractor(path, "auto", capa.main.BACKEND_VIV, disable_progress=True)
                 capabilities, meta = capa.main.find_capabilities(ctx["rules"], extractor, disable_progress=True)
             except Exception as e:
                 logger.error("failed to extract capabilities: %s %s %s", rule.name, path, e)
diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py
index 73ab3da3..a8feb35e 100644
--- a/scripts/show-capabilities-by-function.py
+++ b/scripts/show-capabilities-by-function.py
@@ -199,7 +199,7 @@ def main(argv=None):
     else:
         format = args.format
         try:
-            extractor = capa.main.get_extractor(args.sample, args.format)
+            extractor = capa.main.get_extractor(args.sample, args.format, capa.main.BACKEND_VIV)
         except capa.main.UnsupportedFormatError:
             logger.error("-" * 80)
             logger.error(" Input file does not appear to be a PE file.")
diff --git a/scripts/show-features.py b/scripts/show-features.py
index 60668686..c8f74de9 100644
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -125,7 +125,7 @@ def main(argv=None):
             extractor = capa.features.freeze.load(f.read())
     else:
         try:
-            extractor = capa.main.get_extractor(args.sample, args.format)
+            extractor = capa.main.get_extractor(args.sample, args.format, capa.main.BACKEND_VIV)
         except capa.main.UnsupportedFormatError:
             logger.error("-" * 80)
             logger.error(" Input file does not appear to be a PE file.")
diff --git a/scripts/vivisect-py2-vs-py3.sh b/scripts/vivisect-py2-vs-py3.sh
new file mode 100755
index 00000000..20e7c817
--- /dev/null
+++ b/scripts/vivisect-py2-vs-py3.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+
+int() {
+  int=$(bc <<< "scale=0; ($1 + 0.5)/1")
+}
+
+export TIMEFORMAT='%3R'
+threshold_time=90 # Warn when py2 takes less than this percentage of the py3 time
+threshold_py3_time=60 # Do not warn if it doesn't take at least 1 minute to run
+rm tests/data/*.viv 2>/dev/null
+mkdir -p results
+for file in tests/data/*
+do
+  file=$(printf %q "$file") # Handle names with white spaces
+  file_name=$(basename $file)
+  echo $file_name
+
+  rm "$file.viv" 2>/dev/null
+  py3_time=$(sh -c "time python3 scripts/show-features.py $file > results/p3-$file_name.out 2>/dev/null" 2>&1)
+  rm "$file.viv" 2>/dev/null
+  py2_time=$(sh -c "time python2 scripts/show-features.py $file > results/p2-$file_name.out 2>/dev/null" 2>&1)
+
+  int $py3_time
+  if (($int > $threshold_py3_time))
+  then
+    percentage=$(bc <<< "scale=3; $py2_time/$py3_time*100 + 0.5")
+    int $percentage
+    if (($int < $threshold_time))
+    then
+      echo -n " SLOWER ($percentage): "
+    fi
+  fi
+  echo " PY2($py2_time) PY3($py3_time)"
+done
+
+threshold_features=98
+count=0
+average=0
+results_for() {
+  py3=$(cat "results/p3-$file_name.out" | grep "$1" | wc -l)
+  py2=$(cat "results/p2-$file_name.out" | grep "$1" | wc -l)
+  if (($py2 > 0))
+  then
+    percentage=$(bc <<< "scale=2; 100*$py3/$py2")
+    average=$(bc <<< "scale=2; $percentage + $average")
+    count=$(($count + 1))
+    int $percentage
+    if (($int < $threshold_features))
+    then
+      echo -e "$1: py2($py2) py3($py3) $percentage% - $file_name"
+    fi
+  fi
+}
+
+rm tests/data/*.viv 2>/dev/null
+echo -e '\nRESULTS:'
+for file in tests/data/*
+do
+  file_name=$(basename $file)
+  if test -f "results/p2-$file_name.out"; then
+    results_for 'insn'
+    results_for 'file'
+    results_for 'func'
+    results_for 'bb'
+  fi
+done
+
+average=$(bc <<< "scale=2; $average/$count")
+echo "TOTAL: $average"
diff --git a/setup.py b/setup.py
index 4caaa749..b7c90b3d 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,8 @@ if sys.version_info >= (3, 0):
     # py3
     requirements.append("halo")
     requirements.append("networkx")
+    requirements.append("vivisect==1.0.0")
+    requirements.append("viv-utils==0.3.19")
     requirements.append("smda==1.5.13")
 else:
     # py2
diff --git a/tests/fixtures.py b/tests/fixtures.py
index 4261408b..5ef9c3a4 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -520,11 +520,7 @@ def do_test_feature_count(get_extractor, sample, scope, feature, expected):
 
 
 def get_extractor(path):
-    if sys.version_info >= (3, 0):
-        extractor = get_smda_extractor(path)
-    else:
-        extractor = get_viv_extractor(path)
-
+    extractor = get_viv_extractor(path)
     # overload the extractor so that the fixture exposes `extractor.path`
     setattr(extractor, "path", path)
     return extractor
diff --git a/tests/test_main.py b/tests/test_main.py
index 783fc95c..6732de2d 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -7,6 +7,7 @@
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 import sys
+import json
 import textwrap
 
 import pytest
@@ -365,3 +366,20 @@ def test_not_render_rules_also_matched(z9324d_extractor, capsys):
     assert "act as TCP client" in std.out
     assert "connect TCP socket" in std.out
     assert "create TCP socket" in std.out
+
+
+@pytest.mark.skipif(sys.version_info < (3, 0), reason="the --backend option is only available on Python 3")
+def test_backend_option(capsys):
+    """main() accepts -b/--backend and reports the extractor that was actually used."""
+    path = get_data_path_by_name("pma16-01")
+    assert capa.main.main([path, "-j", "-b", capa.main.BACKEND_VIV]) == 0
+    std = capsys.readouterr()
+    std_json = json.loads(std.out)
+    assert std_json["meta"]["analysis"]["extractor"] == "VivisectFeatureExtractor"
+    assert len(std_json["rules"]) > 0
+
+    assert capa.main.main([path, "-j", "-b", capa.main.BACKEND_SMDA]) == 0
+    std = capsys.readouterr()
+    std_json = json.loads(std.out)
+    assert std_json["meta"]["analysis"]["extractor"] == "SmdaFeatureExtractor"
+    assert len(std_json["rules"]) > 0
diff --git a/tests/test_viv_features.py b/tests/test_viv_features.py
index 0922d758..3206e8cf 100644
--- a/tests/test_viv_features.py
+++ b/tests/test_viv_features.py
@@ -16,8 +16,7 @@ from fixtures import *
     indirect=["sample", "scope"],
 )
 def test_viv_features(sample, scope, feature, expected):
-    with xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2"):
-        do_test_feature_presence(get_viv_extractor, sample, scope, feature, expected)
+    do_test_feature_presence(get_viv_extractor, sample, scope, feature, expected)
 
 
 @parametrize(
@@ -26,5 +25,4 @@ def test_viv_features(sample, scope, feature, expected):
     indirect=["sample", "scope"],
 )
 def test_viv_feature_counts(sample, scope, feature, expected):
-    with xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2"):
-        do_test_feature_count(get_viv_extractor, sample, scope, feature, expected)
+    do_test_feature_count(get_viv_extractor, sample, scope, feature, expected)