merge master

2025-12-22 07:10:29 -08:00 · 2021-03-05 15:23:47 -07:00
parent 6006e87c5e 3e55581bf7
commit 1ee7b7b856
12 changed files with 421 additions and 619 deletions
--- a/.github/pyinstaller/hooks/hook-smda.py
+++ b/.github/pyinstaller/hooks/hook-smda.py
@@ -0,0 +1,5 @@
 # Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
 # PyInstaller hook; judging by the file name it is meant to apply when `smda` is bundled
 # (NOTE(review): confirm that the hooks directory is on the spec's hookspath).
 import PyInstaller.utils.hooks
 # gather the dynamic libraries shipped with the `capstone` package and
 # expose them through the hook-level `binaries` variable.
 # ref: https://groups.google.com/g/pyinstaller/c/amWi0-66uZI/m/miPoKfWjBAAJ
 binaries = PyInstaller.utils.hooks.collect_dynamic_libs("capstone")
--- a/.github/pyinstaller/hooks/hook-vivisect.py
+++ b/.github/pyinstaller/hooks/hook-vivisect.py
@@ -13,3 +13,144 @@ from PyInstaller.utils.hooks import copy_metadata
 #
 # ref: https://github.com/pyinstaller/pyinstaller/issues/1713#issuecomment-162682084
 datas = copy_metadata("vivisect")
 # modules that should be kept out of the bundle when vivisect is collected.
 excludedimports = [
    # the viv GUI requires these heavy libraries,
    # but viv as a library doesn't.
    # they shouldn't be installed in our configuration,
    # but we'll ensure they don't slip in here (such as on developers' systems).
    "PyQt5",
    "qt5",
    "pyqtwebengine",
    # the libraries above are imported by the viv modules below,
    # so really, we'd want to exclude these submodules of viv.
    # NOTE(review): the original author doubted that excluding submodules from here works.
    "vqt",
    "vdb.qt",
    "envi.qt",
    # unused by capa
    "pyasn1",
 ]
 # modules that must be bundled explicitly because static import analysis cannot see them.
 # each module is listed exactly once; the order is otherwise unchanged.
 hiddenimports = [
    # vivisect does manual/runtime importing of its modules,
    # so declare the things that could be imported here.
    "vivisect",
    "vivisect.analysis",
    "vivisect.analysis.amd64",
    "vivisect.analysis.amd64.emulation",
    "vivisect.analysis.amd64.golang",
    "vivisect.analysis.crypto",
    "vivisect.analysis.crypto.constants",
    "vivisect.analysis.elf",
    "vivisect.analysis.elf.elfplt",
    "vivisect.analysis.elf.libc_start_main",
    "vivisect.analysis.generic",
    "vivisect.analysis.generic.codeblocks",
    "vivisect.analysis.generic.emucode",
    "vivisect.analysis.generic.entrypoints",
    "vivisect.analysis.generic.funcentries",
    "vivisect.analysis.generic.impapi",
    "vivisect.analysis.generic.mkpointers",
    "vivisect.analysis.generic.pointers",
    "vivisect.analysis.generic.pointertables",
    "vivisect.analysis.generic.relocations",
    "vivisect.analysis.generic.strconst",
    "vivisect.analysis.generic.switchcase",
    "vivisect.analysis.generic.thunks",
    "vivisect.analysis.generic.noret",
    "vivisect.analysis.i386",
    "vivisect.analysis.i386.calling",
    "vivisect.analysis.i386.golang",
    "vivisect.analysis.i386.importcalls",
    "vivisect.analysis.i386.instrhook",
    "vivisect.analysis.i386.thunk_bx",
    "vivisect.analysis.ms",
    "vivisect.analysis.ms.hotpatch",
    "vivisect.analysis.ms.localhints",
    "vivisect.analysis.ms.msvc",
    "vivisect.analysis.ms.msvcfunc",
    "vivisect.analysis.ms.vftables",
    "vivisect.analysis.pe",
    "vivisect.impapi.posix.amd64",
    "vivisect.impapi.posix.i386",
    "vivisect.impapi.windows",
    "vivisect.impapi.windows.amd64",
    "vivisect.impapi.windows.i386",
    "vivisect.impapi.winkern.i386",
    "vivisect.impapi.winkern.amd64",
    "vivisect.parsers.blob",
    "vivisect.parsers.elf",
    "vivisect.parsers.ihex",
    "vivisect.parsers.macho",
    "vivisect.parsers.pe",
    "vivisect.storage",
    "vivisect.storage.basicfile",
    "vstruct.constants",
    "vstruct.constants.ntstatus",
    "vstruct.defs",
    "vstruct.defs.arm7",
    "vstruct.defs.bmp",
    "vstruct.defs.dns",
    "vstruct.defs.elf",
    "vstruct.defs.gif",
    "vstruct.defs.ihex",
    "vstruct.defs.inet",
    "vstruct.defs.java",
    "vstruct.defs.kdcom",
    "vstruct.defs.macho",
    "vstruct.defs.macho.const",
    "vstruct.defs.macho.fat",
    "vstruct.defs.macho.loader",
    "vstruct.defs.macho.stabs",
    "vstruct.defs.minidump",
    "vstruct.defs.pcap",
    "vstruct.defs.pe",
    "vstruct.defs.pptp",
    "vstruct.defs.rar",
    "vstruct.defs.swf",
    "vstruct.defs.win32",
    "vstruct.defs.windows",
    "vstruct.defs.windows.win_5_1_i386",
    "vstruct.defs.windows.win_5_1_i386.ntdll",
    "vstruct.defs.windows.win_5_1_i386.ntoskrnl",
    "vstruct.defs.windows.win_5_1_i386.win32k",
    "vstruct.defs.windows.win_5_2_i386",
    "vstruct.defs.windows.win_5_2_i386.ntdll",
    "vstruct.defs.windows.win_5_2_i386.ntoskrnl",
    "vstruct.defs.windows.win_5_2_i386.win32k",
    "vstruct.defs.windows.win_6_1_amd64",
    "vstruct.defs.windows.win_6_1_amd64.ntdll",
    "vstruct.defs.windows.win_6_1_amd64.ntoskrnl",
    "vstruct.defs.windows.win_6_1_amd64.win32k",
    "vstruct.defs.windows.win_6_1_i386",
    "vstruct.defs.windows.win_6_1_i386.ntdll",
    "vstruct.defs.windows.win_6_1_i386.ntoskrnl",
    "vstruct.defs.windows.win_6_1_i386.win32k",
    "vstruct.defs.windows.win_6_1_wow64",
    "vstruct.defs.windows.win_6_1_wow64.ntdll",
    "vstruct.defs.windows.win_6_2_amd64",
    "vstruct.defs.windows.win_6_2_amd64.ntdll",
    "vstruct.defs.windows.win_6_2_amd64.ntoskrnl",
    "vstruct.defs.windows.win_6_2_amd64.win32k",
    "vstruct.defs.windows.win_6_2_i386",
    "vstruct.defs.windows.win_6_2_i386.ntdll",
    "vstruct.defs.windows.win_6_2_i386.ntoskrnl",
    "vstruct.defs.windows.win_6_2_i386.win32k",
    "vstruct.defs.windows.win_6_2_wow64",
    "vstruct.defs.windows.win_6_2_wow64.ntdll",
    "vstruct.defs.windows.win_6_3_amd64",
    "vstruct.defs.windows.win_6_3_amd64.ntdll",
    "vstruct.defs.windows.win_6_3_amd64.ntoskrnl",
    "vstruct.defs.windows.win_6_3_i386",
    "vstruct.defs.windows.win_6_3_i386.ntdll",
    "vstruct.defs.windows.win_6_3_i386.ntoskrnl",
    "vstruct.defs.windows.win_6_3_wow64",
    "vstruct.defs.windows.win_6_3_wow64.ntdll",
 ]
--- a/.github/pyinstaller/pyinstaller.spec
+++ b/.github/pyinstaller/pyinstaller.spec
@@ -16,9 +16,10 @@ with open('./capa/version.py', 'wb') as f:
    #                 - commits since
    #                   g------- git hash fragment
    version = (subprocess.check_output(["git", "describe", "--always", "--tags", "--long"])
               .decode("utf-8")
               .strip()
               .replace("tags/", ""))
-    f.write("__version__ = '%s'" % version)
+    f.write(("__version__ = '%s'" % version).encode("utf-8"))
 a = Analysis(
    # when invoking pyinstaller from the project root,
@@ -41,128 +42,6 @@ a = Analysis(
        # ref: https://stackoverflow.com/a/62278462/87207
        (os.path.dirname(wcwidth.__file__), 'wcwidth')
    ],
    hiddenimports=[
        # vivisect does manual/runtime importing of its modules,
        # so declare the things that could be imported here.
        "vivisect",
        "vivisect.analysis",
        "vivisect.analysis.amd64",
        "vivisect.analysis.amd64",
        "vivisect.analysis.amd64.emulation",
        "vivisect.analysis.amd64.golang",
        "vivisect.analysis.crypto",
        "vivisect.analysis.crypto",
        "vivisect.analysis.crypto.constants",
        "vivisect.analysis.elf",
        "vivisect.analysis.elf",
        "vivisect.analysis.elf.elfplt",
        "vivisect.analysis.elf.libc_start_main",
        "vivisect.analysis.generic",
        "vivisect.analysis.generic",
        "vivisect.analysis.generic.codeblocks",
        "vivisect.analysis.generic.emucode",
        "vivisect.analysis.generic.entrypoints",
        "vivisect.analysis.generic.funcentries",
        "vivisect.analysis.generic.impapi",
        "vivisect.analysis.generic.mkpointers",
        "vivisect.analysis.generic.pointers",
        "vivisect.analysis.generic.pointertables",
        "vivisect.analysis.generic.relocations",
        "vivisect.analysis.generic.strconst",
        "vivisect.analysis.generic.switchcase",
        "vivisect.analysis.generic.thunks",
        "vivisect.analysis.i386",
        "vivisect.analysis.i386",
        "vivisect.analysis.i386.calling",
        "vivisect.analysis.i386.golang",
        "vivisect.analysis.i386.importcalls",
        "vivisect.analysis.i386.instrhook",
        "vivisect.analysis.i386.thunk_bx",
        "vivisect.analysis.ms",
        "vivisect.analysis.ms",
        "vivisect.analysis.ms.hotpatch",
        "vivisect.analysis.ms.localhints",
        "vivisect.analysis.ms.msvc",
        "vivisect.analysis.ms.msvcfunc",
        "vivisect.analysis.ms.vftables",
        "vivisect.analysis.pe",
        "vivisect.impapi.posix.amd64",
        "vivisect.impapi.posix.i386",
        "vivisect.impapi.windows",
        "vivisect.impapi.windows.amd64",
        "vivisect.impapi.windows.i386",
        "vivisect.impapi.winkern.i386",
        "vivisect.impapi.winkern.amd64",
        "vivisect.parsers.blob",
        "vivisect.parsers.elf",
        "vivisect.parsers.ihex",
        "vivisect.parsers.macho",
        "vivisect.parsers.pe",
        "vivisect.parsers.utils",
        "vivisect.storage",
        "vivisect.storage.basicfile",
        "vstruct.constants",
        "vstruct.constants.ntstatus",
        "vstruct.defs",
        "vstruct.defs.arm7",
        "vstruct.defs.bmp",
        "vstruct.defs.dns",
        "vstruct.defs.elf",
        "vstruct.defs.gif",
        "vstruct.defs.ihex",
        "vstruct.defs.inet",
        "vstruct.defs.java",
        "vstruct.defs.kdcom",
        "vstruct.defs.macho",
        "vstruct.defs.macho.const",
        "vstruct.defs.macho.fat",
        "vstruct.defs.macho.loader",
        "vstruct.defs.macho.stabs",
        "vstruct.defs.minidump",
        "vstruct.defs.pcap",
        "vstruct.defs.pe",
        "vstruct.defs.pptp",
        "vstruct.defs.rar",
        "vstruct.defs.swf",
        "vstruct.defs.win32",
        "vstruct.defs.windows",
        "vstruct.defs.windows.win_5_1_i386",
        "vstruct.defs.windows.win_5_1_i386.ntdll",
        "vstruct.defs.windows.win_5_1_i386.ntoskrnl",
        "vstruct.defs.windows.win_5_1_i386.win32k",
        "vstruct.defs.windows.win_5_2_i386",
        "vstruct.defs.windows.win_5_2_i386.ntdll",
        "vstruct.defs.windows.win_5_2_i386.ntoskrnl",
        "vstruct.defs.windows.win_5_2_i386.win32k",
        "vstruct.defs.windows.win_6_1_amd64",
        "vstruct.defs.windows.win_6_1_amd64.ntdll",
        "vstruct.defs.windows.win_6_1_amd64.ntoskrnl",
        "vstruct.defs.windows.win_6_1_amd64.win32k",
        "vstruct.defs.windows.win_6_1_i386",
        "vstruct.defs.windows.win_6_1_i386.ntdll",
        "vstruct.defs.windows.win_6_1_i386.ntoskrnl",
        "vstruct.defs.windows.win_6_1_i386.win32k",
        "vstruct.defs.windows.win_6_1_wow64",
        "vstruct.defs.windows.win_6_1_wow64.ntdll",
        "vstruct.defs.windows.win_6_2_amd64",
        "vstruct.defs.windows.win_6_2_amd64.ntdll",
        "vstruct.defs.windows.win_6_2_amd64.ntoskrnl",
        "vstruct.defs.windows.win_6_2_amd64.win32k",
        "vstruct.defs.windows.win_6_2_i386",
        "vstruct.defs.windows.win_6_2_i386.ntdll",
        "vstruct.defs.windows.win_6_2_i386.ntoskrnl",
        "vstruct.defs.windows.win_6_2_i386.win32k",
        "vstruct.defs.windows.win_6_2_wow64",
        "vstruct.defs.windows.win_6_2_wow64.ntdll",
        "vstruct.defs.windows.win_6_3_amd64",
        "vstruct.defs.windows.win_6_3_amd64.ntdll",
        "vstruct.defs.windows.win_6_3_amd64.ntoskrnl",
        "vstruct.defs.windows.win_6_3_i386",
        "vstruct.defs.windows.win_6_3_i386.ntdll",
        "vstruct.defs.windows.win_6_3_i386.ntoskrnl",
        "vstruct.defs.windows.win_6_3_wow64",
        "vstruct.defs.windows.win_6_3_wow64.ntdll",
    ],
    # when invoking pyinstaller from the project root,
    # this gets run from the project root.
    hookspath=['.github/pyinstaller/hooks'],
@@ -180,6 +59,25 @@ a = Analysis(
        # since we don't spawn a notebook, we can safely remove these.
        "IPython",
        "ipywidgets",
        # these are pulled in by networkx
        # but we don't need to compute the strongly connected components.
        "numpy",
        "scipy",
        "matplotlib",
        "pandas",
        "pytest",
        # deps from viv that we don't use.
        # this duplicates the entries in `hook-vivisect`,
        # but works better this way.
        "vqt",
        "vdb.qt",
        "envi.qt",
        "PyQt5",
        "qt5",
        "pyqtwebengine",
        "pyasn1"
    ])
 a.binaries = a.binaries - TOC([
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -26,19 +26,14 @@ jobs:
        uses: actions/checkout@v2
        with:
          submodules: true
-      - name: Set up Python 2.7
+      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
-          python-version: 2.7
+          python-version: 3.9
      - if: matrix.os == 'ubuntu-latest'
        run: sudo apt-get install -y libyaml-dev
      - if: matrix.os == 'windows-latest'
        run: |
          choco install vcredist2008
          choco install --ignore-dependencies vcpython27
      - name: Install PyInstaller
-        # pyinstaller 4 doesn't support Python 2.7
+        run: pip install 'pyinstaller==4.2'
        run: pip install 'pyinstaller==3.*'
      - name: Install capa
        run: pip install -e .
      - name: Build standalone executable
--- a/capa/main.py
+++ b/capa/main.py
@@ -530,19 +530,172 @@ def collect_metadata(argv, sample_path, rules_path, format, extractor):
    }
 def install_common_args(parser, wanted=None):
    """
    add the command line arguments shared by capa's main entry point and its scripts.

    the version, verbosity, logging, and coloring flags are always registered.
    the remaining arguments are registered only when the caller names them in `wanted`,
    which lets many scripts use the same language for their cli arguments.
    see `handle_common_args` to do the common configuration afterwards.

    args:
      parser (argparse.ArgumentParser): a parser to update in place, adding common arguments.
      wanted (Set[str]): collection of arguments to opt-into, including:
        - "sample": required positional argument to input file.
        - "format": flag to override file format.
        - "backend": flag to override analysis backend; only registered under py3.
        - "rules": flag to override path to capa rules.
        - "signatures": repeatable flag naming .sig/.pat signature files.
        - "tag": flag to override/specify which rules to match.
    """
    wanted = set() if wanted is None else wanted

    #
    # common arguments that all scripts will have
    #
    parser.add_argument("--version", action="version", version="%(prog)s {:s}".format(capa.version.__version__))

    # simple on/off switches, registered in the order they should show up in the help text.
    switches = (
        (("-v", "--verbose"), "enable verbose result document (no effect with --json)"),
        (("-vv", "--vverbose"), "enable very verbose result document (no effect with --json)"),
        (("-d", "--debug"), "enable debugging output on STDERR"),
        (("-q", "--quiet"), "disable all output but errors"),
    )
    for flags, description in switches:
        parser.add_argument(*flags, action="store_true", help=description)

    parser.add_argument(
        "--color",
        type=str,
        choices=("auto", "always", "never"),
        default="auto",
        help="enable ANSI color codes in results, default: only during interactive session",
    )

    #
    # arguments that may be opted into:
    #
    #   - sample
    #   - format
    #   - backend
    #   - rules
    #   - signatures
    #   - tag
    #
    if "sample" in wanted:
        if sys.version_info >= (3, 0):
            # Python 3 str handles non-ASCII arguments correctly
            sample_type = str
        else:
            # in #328 we noticed that the sample path is not handled correctly if it contains non-ASCII characters
            # https://stackoverflow.com/a/22947334/ offers a solution and decoding using getfilesystemencoding works
            # in our testing, however other sources suggest `sys.stdin.encoding` (https://stackoverflow.com/q/4012571/)
            sample_type = lambda s: s.decode(sys.getfilesystemencoding())
        parser.add_argument("sample", type=sample_type, help="path to sample to analyze")

    if "format" in wanted:
        formats = [
            ("auto", "(default) detect file type automatically"),
            ("pe", "Windows PE file"),
            ("sc32", "32-bit shellcode"),
            ("sc64", "64-bit shellcode"),
            ("freeze", "features previously frozen by capa"),
        ]
        format_names = [name for name, _ in formats]
        format_help = ", ".join("%s: %s" % (name, description) for name, description in formats)
        parser.add_argument(
            "-f",
            "--format",
            choices=format_names,
            default="auto",
            help="select sample format, %s" % format_help,
        )

    # the backend can only be chosen under py3.
    if sys.version_info >= (3, 0) and "backend" in wanted:
        parser.add_argument(
            "-b",
            "--backend",
            type=str,
            help="select the backend to use",
            choices=(BACKEND_VIV, BACKEND_SMDA),
            default=BACKEND_VIV,
        )

    if "rules" in wanted:
        parser.add_argument(
            "-r",
            "--rules",
            type=str,
            default=RULES_PATH_DEFAULT_STRING,
            help="path to rule file or directory, use embedded rules by default",
        )

    if "signatures" in wanted:
        parser.add_argument(
            "--signature",
            action="append",
            dest="signatures",
            type=str,
            default=[],
            help="use the given signatures to identify library functions, file system paths to .sig/.pat files.",
        )

    if "tag" in wanted:
        parser.add_argument("-t", "--tag", type=str, help="filter on rule meta field values")
 def handle_common_args(args):
    """
    apply the global configuration chosen through the flags from `install_common_args`:
    logging verbosity, the cp65001 codec alias, and terminal coloring.

    args:
      args (argparse.Namespace): parsed arguments that included at least `install_common_args` args.

    raises:
      RuntimeError: when `args.color` is not one of "always", "auto", or "never".
    """
    # --quiet wins over --debug when both are given.
    if args.quiet:
        level = logging.WARNING
    elif args.debug:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger().setLevel(level)

    # disable vivisect-related logging, it's verbose and not relevant for capa users
    set_vivisect_log_level(logging.CRITICAL)

    # py2 doesn't know about cp65001, which is a variant of utf-8 on windows
    # tqdm bails when trying to render the progress bar in this setup.
    # because cp65001 is utf-8, we just map that codepage to the utf-8 codec.
    # see #380 and: https://stackoverflow.com/a/3259271/87207
    import codecs

    def find_cp65001(name):
        return codecs.lookup("utf-8") if name == "cp65001" else None

    codecs.register(find_cp65001)

    # keyword arguments for colorama.init, per --color choice.
    # for "auto", colorama will detect:
    #  - when on Windows console, and fixup coloring, and
    #  - when not an interactive session, and disable coloring
    # renderers should use coloring and assume it will be stripped out if necessary.
    init_kwargs_by_choice = {
        "always": {"strip": False},
        "auto": {},
        "never": {"strip": True},
    }
    if args.color not in init_kwargs_by_choice:
        raise RuntimeError("unexpected --color value: " + args.color)
    colorama.init(**init_kwargs_by_choice[args.color])
 def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    formats = [
        ("auto", "(default) detect file type automatically"),
        ("pe", "Windows PE file"),
        ("sc32", "32-bit shellcode"),
        ("sc64", "64-bit shellcode"),
        ("freeze", "features previously frozen by capa"),
    ]
    format_help = ", ".join(["%s: %s" % (f[0], f[1]) for f in formats])
    desc = "The FLARE team's open-source tool to identify capabilities in executable files."
    epilog = textwrap.dedent(
        """
@@ -575,82 +728,10 @@ def main(argv=None):
    parser = argparse.ArgumentParser(
        description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter
    )
-
+    install_common_args(parser, {"sample", "format", "backend", "signatures", "rules", "tag"})
    if sys.version_info >= (3, 0):
        parser.add_argument(
            # Python 3 str handles non-ASCII arguments correctly
            "sample",
            type=str,
            help="path to sample to analyze",
        )
    else:
        parser.add_argument(
            # in #328 we noticed that the sample path is not handled correctly if it contains non-ASCII characters
            # https://stackoverflow.com/a/22947334/ offers a solution and decoding using getfilesystemencoding works
            # in our testing, however other sources suggest `sys.stdin.encoding` (https://stackoverflow.com/q/4012571/)
            "sample",
            type=lambda s: s.decode(sys.getfilesystemencoding()),
            help="path to sample to analyze",
        )
    parser.add_argument("--version", action="version", version="%(prog)s {:s}".format(capa.version.__version__))
    parser.add_argument(
        "-r",
        "--rules",
        type=str,
        default=RULES_PATH_DEFAULT_STRING,
        help="path to rule file or directory, use embedded rules by default",
    )
    parser.add_argument(
        "-f", "--format", choices=[f[0] for f in formats], default="auto", help="select sample format, %s" % format_help
    )
    if sys.version_info >= (3, 0):
        parser.add_argument(
            "-b",
            "--backend",
            type=str,
            help="select the backend to use",
            choices=(BACKEND_VIV, BACKEND_SMDA),
            default=BACKEND_VIV,
        )
    parser.add_argument(
        "--signature",
        action="append",
        dest="signatures",
        type=str,
        default=[],
        help="use the given signatures to identify library functions, file system paths to .sig/.pat files.",
    )
    parser.add_argument("-t", "--tag", type=str, help="filter on rule meta field values")
    parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text")
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="enable verbose result document (no effect with --json)"
    )
    parser.add_argument(
        "-vv", "--vverbose", action="store_true", help="enable very verbose result document (no effect with --json)"
    )
    parser.add_argument("-d", "--debug", action="store_true", help="enable debugging output on STDERR")
    parser.add_argument("-q", "--quiet", action="store_true", help="disable all output but errors")
    parser.add_argument(
        "--color",
        type=str,
        choices=("auto", "always", "never"),
        default="auto",
        help="enable ANSI color codes in results, default: only during interactive session",
    )
    args = parser.parse_args(args=argv)
-
+    handle_common_args(args)
    if args.quiet:
        logging.basicConfig(level=logging.WARNING)
        logging.getLogger().setLevel(logging.WARNING)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger().setLevel(logging.INFO)
    # disable vivisect-related logging, it's verbose and not relevant for capa users
    set_vivisect_log_level(logging.CRITICAL)
    try:
        taste = get_file_taste(args.sample)
@@ -660,14 +741,6 @@ def main(argv=None):
        logger.error("%s", e.args[0])
        return -1
    # py2 doesn't know about cp65001, which is a variant of utf-8 on windows
    # tqdm bails when trying to render the progress bar in this setup.
    # because cp65001 is utf-8, we just map that codepage to the utf-8 codec.
    # see #380 and: https://stackoverflow.com/a/3259271/87207
    import codecs
    codecs.register(lambda name: codecs.lookup("utf-8") if name == "cp65001" else None)
    if args.rules == RULES_PATH_DEFAULT_STRING:
        logger.debug("-" * 80)
        logger.debug(" Using default embedded rules.")
@@ -724,7 +797,7 @@ def main(argv=None):
    else:
        format = args.format
        try:
-            backend = args.backend if sys.version_info > (3, 0) else capa.main.BACKEND_VIV
+            backend = args.backend if sys.version_info > (3, 0) else BACKEND_VIV
            extractor = get_extractor(args.sample, args.format, backend, args.signatures, disable_progress=args.quiet)
        except UnsupportedFormatError:
            logger.error("-" * 80)
@@ -758,19 +831,6 @@ def main(argv=None):
        if not (args.verbose or args.vverbose or args.json):
            return -1
    if args.color == "always":
        colorama.init(strip=False)
    elif args.color == "auto":
        # colorama will detect:
        #  - when on Windows console, and fixup coloring, and
        #  - when not an interactive session, and disable coloring
        # renderers should use coloring and assume it will be stripped out if necessary.
        colorama.init()
    elif args.color == "never":
        colorama.init(strip=True)
    else:
        raise RuntimeError("unexpected --color value: " + args.color)
    if args.json:
        print(capa.render.render_json(meta, rules, capabilities))
    elif args.vverbose:
--- a/scripts/bulk-process.py
+++ b/scripts/bulk-process.py
@@ -65,6 +65,7 @@ import multiprocessing.pool
 import capa
 import capa.main
 import capa.rules
 import capa.render
 logger = logging.getLogger("capa")
@@ -139,50 +140,14 @@ def main(argv=None):
        argv = sys.argv[1:]
        parser = argparse.ArgumentParser(description="detect capabilities in programs.")
        capa.main.install_common_args(parser, wanted={"rules", "signatures"})
        parser.add_argument("input", type=str, help="Path to directory of files to recursively analyze")
        parser.add_argument(
            "-r",
            "--rules",
            type=str,
            default="(embedded rules)",
            help="Path to rule file or directory, use embedded rules by default",
        )
        parser.add_argument(
            "--signature",
            action="append",
            dest="signatures",
            type=str,
            default=[],
            help="use the given signatures to identify library functions, file system paths to .sig/.pat files.",
        )
        parser.add_argument("-d", "--debug", action="store_true", help="Enable debugging output on STDERR")
        parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors")
        parser.add_argument(
            "-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor"
        )
        parser.add_argument("--no-mp", action="store_true", help="disable subprocesses")
        args = parser.parse_args(args=argv)
-
+        capa.main.handle_common_args(args)
        if args.quiet:
            logging.basicConfig(level=logging.ERROR)
            logging.getLogger().setLevel(logging.ERROR)
        elif args.debug:
            logging.basicConfig(level=logging.DEBUG)
            logging.getLogger().setLevel(logging.DEBUG)
        else:
            logging.basicConfig(level=logging.INFO)
            logging.getLogger().setLevel(logging.INFO)
        # disable vivisect-related logging, it's verbose and not relevant for capa users
        capa.main.set_vivisect_log_level(logging.CRITICAL)
        # py2 doesn't know about cp65001, which is a variant of utf-8 on windows
        # tqdm bails when trying to render the progress bar in this setup.
        # because cp65001 is utf-8, we just map that codepage to the utf-8 codec.
        # see #380 and: https://stackoverflow.com/a/3259271/87207
        import codecs
        codecs.register(lambda name: codecs.lookup("utf-8") if name == "cp65001" else None)
        if args.rules == "(embedded rules)":
            logger.info("using default embedded rules")
--- a/scripts/capa_as_library.py
+++ b/scripts/capa_as_library.py
@@ -6,6 +6,7 @@ import collections
 import capa.main
 import capa.rules
 import capa.engine
 import capa.render
 import capa.features
 import capa.render.utils as rutils
 from capa.engine import *
--- a/scripts/import-to-ida.py
+++ b/scripts/import-to-ida.py
@@ -31,10 +31,8 @@ See the License for the specific language governing permissions and limitations
 import json
 import logging
 import idc
 import idautils
 import ida_funcs
 import ida_idaapi
 import ida_kernwin
 logger = logging.getLogger("capa")
--- a/scripts/lint.py
+++ b/scripts/lint.py
@@ -322,7 +322,7 @@ class FormatIncorrect(Lint):
        expected = capa.rules.Rule.from_yaml(rule.definition, use_ruamel=True).to_yaml()
        if actual != expected:
-            diff = difflib.ndiff(actual.splitlines(1), expected.splitlines(1))
+            diff = difflib.ndiff(actual.splitlines(1), expected.splitlines(True))
            self.recommendation = self.recommendation_template.format("".join(diff))
            return True
@@ -557,6 +557,7 @@ def main(argv=None):
    samples_path = os.path.join(os.path.dirname(__file__), "..", "tests", "data")
    parser = argparse.ArgumentParser(description="A program.")
    capa.main.install_common_args(parser, wanted={"tag"})
    parser.add_argument("rules", type=str, help="Path to rules")
    parser.add_argument("--samples", type=str, default=samples_path, help="Path to samples")
    parser.add_argument(
@@ -564,22 +565,9 @@ def main(argv=None):
        action="store_true",
        help="Enable thorough linting - takes more time, but does a better job",
    )
    parser.add_argument("-t", "--tag", type=str, help="filter on rule meta field values")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
    parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors")
    args = parser.parse_args(args=argv)
    capa.main.handle_common_args(args)
    if args.verbose:
        level = logging.DEBUG
    elif args.quiet:
        level = logging.ERROR
    else:
        level = logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger("capa.lint").setLevel(level)
    capa.main.set_vivisect_log_level(logging.CRITICAL)
    logging.getLogger("capa").setLevel(logging.CRITICAL)
    logging.getLogger("viv_utils").setLevel(logging.CRITICAL)
--- a/scripts/migrate-rules.py
+++ b/scripts/migrate-rules.py
@@ -1,167 +0,0 @@
 #!/usr/bin/env python
 """
 migrate rules and their namespaces.
 example:
    $ python scripts/migrate-rules.py migration.csv ./rules ./new-rules
 Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at: [package root]/LICENSE.txt
 Unless required by applicable law or agreed to in writing, software distributed under the License
 is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and limitations under the License.
 """
 import os
 import csv
 import sys
 import logging
 import os.path
 import argparse
 import collections
 import capa.rules
 logger = logging.getLogger("migrate-rules")
 def read_plan(plan_path):
    """
    read the migration plan CSV into a list of per-row dicts.

    the file is read without a header row: columns are named positionally by `fieldnames` below,
    and any columns beyond those are collected into a list under the key "other".
    columns missing from a short row are present with the value None.

    args:
      plan_path: file system path to the CSV file.

    returns: list of dict-like rows, in file order.
    """
    fieldnames = (
        "existing path",
        "existing name",
        "existing rule-category",
        "proposed name",
        "proposed namespace",
        "ATT&CK",
        "MBC",
        "comment1",
    )
    # the csv module consumes bytes on Python 2, but on Python 3 it requires text
    # from a file opened with newline="" (binary mode raises "iterator should return strings").
    if sys.version_info[0] < 3:
        f = open(plan_path, "rb")
    else:
        # assumes the plan is UTF-8 encoded, like the rule files this script writes - confirm for your data.
        f = open(plan_path, "r", newline="", encoding="utf-8")
    with f:
        return list(csv.DictReader(f, restkey="other", fieldnames=fieldnames))
 def read_rules(rule_directory):
    """
    recursively load the rules stored beneath `rule_directory`.

    only paths ending in ".yml" are parsed; every other file is logged and skipped.
    a rule whose path contains the text "nursery" gets the meta flag "capa/nursery" set to True.

    args:
      rule_directory: root directory to walk.

    returns: dict mapping rule name to rule instance; a later rule with the same name replaces an earlier one.
    """
    loaded = {}
    for dirpath, _subdirs, filenames in os.walk(rule_directory):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if path.endswith(".yml"):
                rule = capa.rules.Rule.from_yaml_file(path)
                loaded[rule.name] = rule
                # substring match against the whole path, not just the containing directory name.
                if "nursery" in path:
                    rule.meta["capa/nursery"] = True
            else:
                logger.info("skipping file: %s", path)
    return loaded
 def _rule_filename(rule_name):
    """derive a rule's output file name: lowercased, spaces become dashes, `()+/` dropped, ".yml" appended."""
    filename = rule_name.lower().replace(" ", "-")
    for unwanted in "()+/":
        filename = filename.replace(unwanted, "")
    return filename + ".yml"
 def _destination_directory(destination, rule):
    """pick the directory beneath `destination` for a rule: "nursery" first, then "lib", else its namespace."""
    if rule.meta.get("capa/nursery"):
        return os.path.join(destination, "nursery")
    if rule.meta.get("lib"):
        return os.path.join(destination, "lib")
    return os.path.join(destination, rule.meta.get("namespace"))
 def main(argv=None):
    """
    rename and re-home the rules from a source directory according to a CSV migration plan.

    each planned rule gets its proposed name and namespace, loses any "rule-category" meta field,
    and receives "att&ck" / "mbc" meta entries when the plan provides them.
    every loaded rule is then written beneath the destination directory,
    with `- match: <old name>` references rewritten to the new names.

    args:
      argv: command line arguments (plan, source, destination); defaults to `sys.argv[1:]`.

    returns: 0 on success, -1 when the plan does not account for every rule found in the source directory.

    raises: KeyError when a plan row names a rule that was not loaded from the source directory.
    """
    if argv is None:
        argv = sys.argv[1:]
    parser = argparse.ArgumentParser(description="migrate rules.")
    parser.add_argument("plan", type=str, help="Path to CSV describing migration")
    parser.add_argument("source", type=str, help="Source directory of rules")
    parser.add_argument("destination", type=str, help="Destination directory of rules")
    args = parser.parse_args(args=argv)
    logging.basicConfig(level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)
    plan = read_plan(args.plan)
    logger.info("read %d plan entries", len(plan))
    rules = read_rules(args.source)
    logger.info("read %d rules", len(rules))
    # refuse to migrate unless every existing rule is mentioned by the plan.
    planned_rules = {row["existing name"] for row in plan}
    unplanned_rules = [rule for (name, rule) in rules.items() if name not in planned_rules]
    if unplanned_rules:
        logger.error("plan does not account for %d rules:", len(unplanned_rules))
        for rule in unplanned_rules:
            logger.error("  %s", rule.name)
        return -1
    # pairs of strings (needle, replacement)
    match_translations = []
    for row in plan:
        # rows without an existing rule name have nothing to migrate.
        if not row["existing name"]:
            continue
        rule = rules[row["existing name"]]
        if rule.meta["name"] != row["proposed name"]:
            logger.info("renaming rule '%s' -> '%s'", rule.meta["name"], row["proposed name"])
            # assume the yaml is formatted like `- match: $rule-name`.
            # but since it's been linted, this should be ok.
            match_translations.append(("- match: " + rule.meta["name"], "- match: " + row["proposed name"]))
            rule.meta["name"] = row["proposed name"]
            rule.name = row["proposed name"]
        if "rule-category" in rule.meta:
            logger.info("deleting rule category '%s'", rule.meta["rule-category"])
            del rule.meta["rule-category"]
        rule.meta["namespace"] = row["proposed namespace"]
        if row["ATT&CK"] != "n/a" and row["ATT&CK"] != "":
            # the text after the final space is moved into brackets: `<name> <id>` -> `<name> [<id>]`.
            name, _, technique_id = row["ATT&CK"].rpartition(" ")
            rule.meta["att&ck"] = ["%s [%s]" % (name, technique_id)]
        if row["MBC"] != "n/a" and row["MBC"] != "":
            rule.meta["mbc"] = [row["MBC"]]
    for rule in rules.values():
        directory = _destination_directory(args.destination, rule)
        # create (and announce) only directories that don't exist yet.
        # a genuine makedirs failure is raised here instead of being swallowed and resurfacing at open() below.
        if not os.path.isdir(directory):
            os.makedirs(directory)
            logger.info("created namespace: %s", directory)
        path = os.path.join(directory, _rule_filename(rule.name))
        logger.info("writing rule %s", path)
        doc = rule.to_yaml().decode("utf-8")
        # renames are applied textually so that rules matching on a renamed rule keep working.
        for (needle, replacement) in match_translations:
            doc = doc.replace(needle, replacement)
        with open(path, "wb") as f:
            f.write(doc.encode("utf-8"))
    return 0
 # script entry point: the value returned by main() (0 or -1) becomes the process exit status.
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/show-capabilities-by-function.py
+++ b/scripts/show-capabilities-by-function.py
@@ -110,151 +110,94 @@ def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
-        formats = [
+    parser = argparse.ArgumentParser(description="detect capabilities in programs.")
-            ("auto", "(default) detect file type automatically"),
+    capa.main.install_common_args(parser, wanted={"format", "backend", "sample", "signatures", "rules", "tag"})
-            ("pe", "Windows PE file"),
+    args = parser.parse_args(args=argv)
-            ("sc32", "32-bit shellcode"),
+    capa.main.handle_common_args(args)
            ("sc64", "64-bit shellcode"),
            ("freeze", "features previously frozen by capa"),
        ]
        format_help = ", ".join(["%s: %s" % (f[0], f[1]) for f in formats])
-        parser = argparse.ArgumentParser(description="detect capabilities in programs.")
+    try:
-        parser.add_argument("sample", type=str, help="Path to sample to analyze")
+        taste = get_file_taste(args.sample)
-        parser.add_argument(
+    except IOError as e:
-            "-r",
+        logger.error("%s", str(e))
-            "--rules",
+        return -1
            type=str,
            default="(embedded rules)",
            help="Path to rule file or directory, use embedded rules by default",
        )
        parser.add_argument("-t", "--tag", type=str, help="Filter on rule meta field values")
        parser.add_argument("-d", "--debug", action="store_true", help="Enable debugging output on STDERR")
        parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors")
        parser.add_argument(
            "-f",
            "--format",
            choices=[f[0] for f in formats],
            default="auto",
            help="Select sample format, %s" % format_help,
        )
        parser.add_argument(
            "--signature",
            action="append",
            dest="signatures",
            type=str,
            default=[],
            help="use the given signatures to identify library functions, file system paths to .sig/.pat files.",
        )
        args = parser.parse_args(args=argv)
-        if args.quiet:
+    if args.rules == "(embedded rules)":
-            logging.basicConfig(level=logging.ERROR)
+        logger.info("-" * 80)
-            logging.getLogger().setLevel(logging.ERROR)
+        logger.info(" Using default embedded rules.")
-        elif args.debug:
+        logger.info(" To provide your own rules, use the form `capa.exe -r ./path/to/rules/  /path/to/mal.exe`.")
-            logging.basicConfig(level=logging.DEBUG)
+        logger.info(" You can see the current default rule set here:")
-            logging.getLogger().setLevel(logging.DEBUG)
+        logger.info("     https://github.com/fireeye/capa-rules")
-        else:
+        logger.info("-" * 80)
            logging.basicConfig(level=logging.INFO)
            logging.getLogger().setLevel(logging.INFO)
-        # disable vivisect-related logging, it's verbose and not relevant for capa users
+        logger.debug("detected running from source")
-        capa.main.set_vivisect_log_level(logging.CRITICAL)
+        args.rules = os.path.join(os.path.dirname(__file__), "..", "rules")
        logger.debug("default rule path (source method): %s", args.rules)
    else:
        logger.info("using rules path: %s", args.rules)
    try:
        rules = capa.main.get_rules(args.rules)
        rules = capa.rules.RuleSet(rules)
        logger.info("successfully loaded %s rules", len(rules))
        if args.tag:
            rules = rules.filter_rules_by_meta(args.tag)
            logger.info("selected %s rules", len(rules))
    except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e:
        logger.error("%s", str(e))
        return -1
    if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)):
        format = "freeze"
        with open(args.sample, "rb") as f:
            extractor = capa.features.freeze.load(f.read())
    else:
        format = args.format
        try:
-            taste = get_file_taste(args.sample)
+            extractor = capa.main.get_extractor(args.sample, args.format, args.backend, args.signatures)
-        except IOError as e:
+        except capa.main.UnsupportedFormatError:
-            logger.error("%s", str(e))
+            logger.error("-" * 80)
            logger.error(" Input file does not appear to be a PE file.")
            logger.error(" ")
            logger.error(
                " capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64)."
            )
            logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.")
            logger.error("-" * 80)
            return -1
        except capa.main.UnsupportedRuntimeError:
            logger.error("-" * 80)
            logger.error(" Unsupported runtime or Python interpreter.")
            logger.error(" ")
            logger.error(" capa supports running under Python 2.7 using Vivisect for binary analysis.")
            logger.error(" It can also run within IDA Pro, using either Python 2.7 or 3.5+.")
            logger.error(" ")
            logger.error(" If you're seeing this message on the command line, please ensure you're running Python 2.7.")
            logger.error("-" * 80)
            return -1
-        # py2 doesn't know about cp65001, which is a variant of utf-8 on windows
+    meta = capa.main.collect_metadata(argv, args.sample, args.rules, format, extractor)
-        # tqdm bails when trying to render the progress bar in this setup.
+    capabilities, counts = capa.main.find_capabilities(rules, extractor)
-        # because cp65001 is utf-8, we just map that codepage to the utf-8 codec.
+    meta["analysis"].update(counts)
        # see #380 and: https://stackoverflow.com/a/3259271/87207
        import codecs
-        codecs.register(lambda name: codecs.lookup("utf-8") if name == "cp65001" else None)
+    if capa.main.has_file_limitation(rules, capabilities):
-
+        # bail if capa encountered file limitation e.g. a packed binary
-        if args.rules == "(embedded rules)":
+        # do show the output in verbose mode, though.
-            logger.info("-" * 80)
+        if not (args.verbose or args.vverbose or args.json):
            logger.info(" Using default embedded rules.")
            logger.info(" To provide your own rules, use the form `capa.exe -r ./path/to/rules/  /path/to/mal.exe`.")
            logger.info(" You can see the current default rule set here:")
            logger.info("     https://github.com/fireeye/capa-rules")
            logger.info("-" * 80)
            logger.debug("detected running from source")
            args.rules = os.path.join(os.path.dirname(__file__), "..", "rules")
            logger.debug("default rule path (source method): %s", args.rules)
        else:
            logger.info("using rules path: %s", args.rules)
        try:
            rules = capa.main.get_rules(args.rules)
            rules = capa.rules.RuleSet(rules)
            logger.info("successfully loaded %s rules", len(rules))
            if args.tag:
                rules = rules.filter_rules_by_meta(args.tag)
                logger.info("selected %s rules", len(rules))
        except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e:
            logger.error("%s", str(e))
            return -1
-        if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)):
+    # colorama will detect:
-            format = "freeze"
+    #  - when on Windows console, and fixup coloring, and
-            with open(args.sample, "rb") as f:
+    #  - when not an interactive session, and disable coloring
-                extractor = capa.features.freeze.load(f.read())
+    # renderers should use coloring and assume it will be stripped out if necessary.
-        else:
+    colorama.init()
-            format = args.format
+    doc = capa.render.convert_capabilities_to_result_document(meta, rules, capabilities)
-            try:
+    print(render_matches_by_function(doc))
-                extractor = capa.main.get_extractor(args.sample, args.format, capa.main.BACKEND_VIV, args.signatures)
+    colorama.deinit()
            except capa.main.UnsupportedFormatError:
                logger.error("-" * 80)
                logger.error(" Input file does not appear to be a PE file.")
                logger.error(" ")
                logger.error(
                    " capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64)."
                )
                logger.error(
                    " If you don't know the input file type, you can try using the `file` utility to guess it."
                )
                logger.error("-" * 80)
                return -1
            except capa.main.UnsupportedRuntimeError:
                logger.error("-" * 80)
                logger.error(" Unsupported runtime or Python interpreter.")
                logger.error(" ")
                logger.error(" capa supports running under Python 2.7 using Vivisect for binary analysis.")
                logger.error(" It can also run within IDA Pro, using either Python 2.7 or 3.5+.")
                logger.error(" ")
                logger.error(
                    " If you're seeing this message on the command line, please ensure you're running Python 2.7."
                )
                logger.error("-" * 80)
                return -1
-        meta = capa.main.collect_metadata(argv, args.sample, args.rules, format, extractor)
+    logger.info("done.")
        capabilities, counts = capa.main.find_capabilities(rules, extractor)
        meta["analysis"].update(counts)
-        if capa.main.has_file_limitation(rules, capabilities):
+    return 0
            # bail if capa encountered file limitation e.g. a packed binary
            # do show the output in verbose mode, though.
            if not (args.verbose or args.vverbose or args.json):
                return -1
        # colorama will detect:
        #  - when on Windows console, and fixup coloring, and
        #  - when not an interactive session, and disable coloring
        # renderers should use coloring and assume it will be stripped out if necessary.
        colorama.init()
        doc = capa.render.convert_capabilities_to_result_document(meta, rules, capabilities)
        print(render_matches_by_function(doc))
        colorama.deinit()
        logger.info("done.")
        return 0
 if __name__ == "__main__":
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -82,37 +82,12 @@ def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    formats = [
        ("auto", "(default) detect file type automatically"),
        ("pe", "Windows PE file"),
        ("sc32", "32-bit shellcode"),
        ("sc64", "64-bit shellcode"),
        ("freeze", "features previously frozen by capa"),
    ]
    format_help = ", ".join(["%s: %s" % (f[0], f[1]) for f in formats])
    parser = argparse.ArgumentParser(description="Show the features that capa extracts from the given sample")
-    parser.add_argument("sample", type=str, help="Path to sample to analyze")
+    capa.main.install_common_args(parser, wanted={"format", "sample"})
-    parser.add_argument(
+
        "-f", "--format", choices=[f[0] for f in formats], default="auto", help="Select sample format, %s" % format_help
    )
    parser.add_argument("-F", "--function", type=lambda x: int(x, 0x10), help="Show features for specific function")
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debugging output on STDERR")
    parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors")
    args = parser.parse_args(args=argv)
-
+    capa.main.handle_common_args(args)
    if args.quiet:
        logging.basicConfig(level=logging.ERROR)
        logging.getLogger().setLevel(logging.ERROR)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger().setLevel(logging.INFO)
    # disable vivisect-related logging, it's verbose and not relevant for capa users
    capa.main.set_vivisect_log_level(logging.CRITICAL)
    try:
        taste = capa.helpers.get_file_taste(args.sample)