capa/scripts/lint.py

'''
Check the given capa rules for style issues.

Usage:

   $ python scripts/lint.py rules/
'''
import os
import os.path
import sys
import string
import hashlib
import logging
import os.path
import itertools

import argparse

import capa.main
import capa.engine
import capa.features

# module-level logger; `main` sets its level from the --verbose/--quiet flags.
logger = logging.getLogger('capa.lint')


class Lint(object):
    '''
    Base class for all lints.

    Subclasses override `name` and `recommendation`, and implement
    `check_rule` and/or `check_features`.
    Both hooks return a truthy value when the lint is violated.
    The defaults here never report a violation, so a subclass only needs to
    override the hook it cares about.
    '''
    # short label printed when the lint is violated.
    name = 'lint'
    # advice printed next to the label.
    recommendation = ''

    def check_rule(self, ctx, rule):
        '''
        Check a single rule.

        Args:
          ctx (dict): lint context, as built by `main`.
          rule: the rule to inspect.

        Returns:
          bool: True if the rule violates this lint.
        '''
        return False

    def check_features(self, ctx, features):
        '''
        Check the features collected for a rule.

        Args:
          ctx (dict): lint context, as built by `main`.
          features (list): features extracted from the rule.

        Returns:
          bool: True if the features violate this lint.
        '''
        return False


class NameCasing(Lint):
    '''
    Flag rule names that start with an ASCII upper case letter,
    unless the second character is upper case too.
    '''
    name = 'rule name casing'
    recommendation = 'Rename rule using to start with lower case letters'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if the rule name starts with a capitalized word.
        '''
        rule_name = rule.name

        # an empty name has no casing to complain about;
        # guard here so that we don't raise IndexError below.
        if not rule_name:
            return False

        if rule_name[0] not in string.ascii_uppercase:
            return False

        # a single upper case letter cannot be followed by a second upper case letter,
        # so it is reported (previously this raised IndexError).
        if len(rule_name) < 2:
            return True

        # two leading upper case letters are tolerated
        # (presumably to allow acronyms at the start of a name).
        return rule_name[1] not in string.ascii_uppercase


class FilenameDoesntMatchRuleName(Lint):
    '''
    Flag rules whose file name is not derived from the rule name.
    '''
    name = 'filename doesn\'t match the rule name'
    recommendation = 'Rename rule file to match the rule name'

    def check_rule(self, ctx, rule):
        '''
        The expected file name is the lower-cased rule name with spaces turned
        into dashes, the characters `(`, `)`, `+`, and `/` removed, and `.yml` appended.

        Returns:
          bool: True if the basename of `capa/path` differs from the expected file name.
        '''
        slug = rule.name.lower().replace(" ", "-")
        for dropped in ("(", ")", "+", "/"):
            slug = slug.replace(dropped, "")

        actual = os.path.basename(rule.meta['capa/path'])
        return actual != slug + ".yml"


class MissingNamespace(Lint):
    '''
    Flag rules that do not declare a namespace.
    '''
    name = 'missing rule namespace'
    recommendation = 'Add meta.namespace so that the rule is emitted correctly'

    def check_rule(self, ctx, rule):
        '''
        Nursery rules, and rules carrying a `maec/malware-category` or `lib`
        meta key, are exempt.

        Returns:
          bool: True if a non-exempt rule has no `namespace` meta key.
        '''
        if 'namespace' in rule.meta:
            return False

        if is_nursery_rule(rule):
            return False

        exempt = 'maec/malware-category' in rule.meta or 'lib' in rule.meta
        return not exempt


class NamespaceDoesntMatchRulePath(Lint):
    '''
    Flag rules whose namespace does not appear in the path of the rule file.
    '''
    name = 'file path doesn\'t match rule namespace'
    recommendation = 'Move rule to appropriate directory or update the namespace'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if the namespace is not a substring of the rule path
            (compared with forward slashes as separators).
        '''
        meta = rule.meta

        # a missing namespace is reported by other lints.
        if 'namespace' not in meta or is_nursery_rule(rule):
            return False

        if 'maec/malware-category' in meta or 'lib' in meta:
            return False

        namespace = meta["namespace"]
        # normalize Windows separators so the comparison works on any platform.
        normalized_path = meta['capa/path'].replace('\\', '/')
        return namespace not in normalized_path


class MissingScope(Lint):
    '''
    Flag rules that do not state their scope explicitly.
    '''
    name = 'missing scope'
    recommendation = 'Add meta.scope so that the scope is explicit (defaults to `function`)'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if the rule has no `scope` meta key.
        '''
        has_scope = 'scope' in rule.meta
        return not has_scope


class InvalidScope(Lint):
    '''
    Flag rules whose declared scope is not one of the supported scopes.
    '''
    name = 'invalid scope'
    recommendation = 'Use only file, function, or basic block rule scopes'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if the rule declares a scope other than
            `file`, `function`, or `basic block`.
        '''
        if 'scope' not in rule.meta:
            # let the MissingScope lint catch this case, don't double report.
            return False

        return rule.meta['scope'] not in ('file', 'function', 'basic block')


class MissingAuthor(Lint):
    '''
    Flag rules that do not name an author.
    '''
    name = 'missing author'
    recommendation = 'Add meta.author so that users know who to contact with questions'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if the rule has no `author` meta key.
        '''
        has_author = 'author' in rule.meta
        return not has_author


class MissingExamples(Lint):
    '''
    Flag rules that do not reference any example.
    '''
    name = 'missing examples'
    recommendation = 'Add meta.examples so that the rule can be tested and verified'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if `examples` is absent, is not a list, is an empty list,
            or is the list `[None]`.
        '''
        examples = rule.meta.get('examples')

        # covers both the absent key (None) and any non-list value.
        if not isinstance(examples, list):
            return True

        if len(examples) == 0:
            return True

        return examples == [None]


class MissingExampleOffset(Lint):
    '''
    Flag function and basic block scoped rules whose examples lack an offset.
    '''
    name = 'missing example offset'
    recommendation = 'Add offset of example function'

    def check_rule(self, ctx, rule):
        '''
        An example is expected to contain a `:` separating the sample id from the offset.

        Returns:
          bool: True if any non-empty example of a `function` or `basic block`
            scoped rule contains no `:`.
        '''
        if rule.meta.get('scope') not in ('function', 'basic block'):
            return False

        examples = rule.meta.get('examples')
        if not isinstance(examples, list):
            # missing or malformed examples (e.g. an empty `examples:` key yields None).
            # let the MissingExamples lint catch this case, don't crash or double report.
            return False

        for example in examples:
            if example and ':' not in example:
                logger.debug('example: %s', example)
                return True

        return False


class ExampleFileDNE(Lint):
    '''
    Flag rules for which none of the referenced examples is among the collected samples.
    '''
    name = 'referenced example doesn\'t exist'
    recommendation = 'Add the referenced example to samples directory ($capa-root/tests/data or supplied via --samples)'

    def check_rule(self, ctx, rule):
        '''
        The sample id of an example is the text before the first `:`.

        Returns:
          bool: True if the rule has examples, but no sample id is a key of `ctx['samples']`.
        '''
        examples = rule.meta.get('examples')
        if not examples:
            # let the MissingExamples lint catch this case, don't double report.
            return False

        for example in examples:
            if not example:
                continue

            sample_id, _, _ = example.partition(':')
            if sample_id in ctx['samples']:
                # one resolvable example is enough.
                return False

        return True


class DoesntMatchExample(Lint):
    '''
    Flag rules that do not match on the samples they reference as examples.

    Only active when `ctx['is_thorough']` is set, since it analyzes each sample.
    '''
    name = 'doesn\'t match on referenced example'
    recommendation = 'Fix the rule logic or provide a different example'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if analysis of a referenced sample fails,
            or the rule name is not among the capabilities found in it.
        '''
        if not ctx['is_thorough']:
            return False

        # `examples` may be present but None (empty key); treat that like no examples.
        # lint MissingExamples will catch this.
        for example in rule.meta.get('examples') or []:
            if not example:
                # empty entry, e.g. `[None]`; lint MissingExamples will catch this.
                continue

            example_id = example.partition(':')[0]
            try:
                path = ctx['samples'][example_id]
            except KeyError:
                # lint ExampleFileDNE will catch this.
                # don't double report.
                continue

            try:
                extractor = capa.main.get_extractor(path, 'auto')
                capabilities = capa.main.find_capabilities(ctx['rules'], extractor, disable_progress=True)
            except Exception as e:
                # deliberately broad: any analysis failure means we could not confirm the match.
                logger.error('failed to extract capabilities: %s %s %s', rule.name, path, e)
                return True

            if rule.name not in capabilities:
                return True

        return False


class UnusualMetaField(Lint):
    '''
    Flag rules that carry a meta key that capa does not know about.
    '''
    name = 'unusual meta field'
    recommendation = 'Remove the unusual meta field'

    def check_rule(self, ctx, rule):
        '''
        Returns:
          bool: True if any meta key is in neither `capa.rules.META_KEYS`
            nor `capa.rules.HIDDEN_META_KEYS`. Only the first such key is logged.
        '''
        for key in rule.meta.keys():
            is_known = key in capa.rules.META_KEYS or key in capa.rules.HIDDEN_META_KEYS
            if not is_known:
                logger.debug("unusual meta field: %s", key)
                return True

        return False


class FeatureStringTooShort(Lint):
    '''
    Flag string features that are shorter than four characters.
    '''
    name = 'feature string too short'
    recommendation = 'capa only extracts strings with length >= 4; will not match on "{:s}"'
    # pristine copy of the message template.
    # lint instances are shared (see FEATURE_LINTS), so the instance-level
    # `recommendation` must always be rendered from this template:
    # formatting `recommendation` in place would consume the placeholder on
    # the first hit and every later rule would report the first rule's string.
    _recommendation_template = recommendation

    def check_features(self, ctx, features):
        '''
        On a hit, `self.recommendation` is set to the message for the offending string.

        Args:
          ctx (dict): lint context (unused here).
          features (list): features collected for a rule.

        Returns:
          bool: True if any `capa.features.String` feature has a value shorter than four characters.
        '''
        for feature in features:
            if isinstance(feature, capa.features.String):
                if len(feature.value) < 4:
                    self.recommendation = self._recommendation_template.format(feature.value)
                    return True
        return False


def run_lints(lints, ctx, rule):
    '''
    Lazily yield each lint from `lints` whose `check_rule(ctx, rule)` is truthy.
    '''
    for candidate in lints:
        violated = candidate.check_rule(ctx, rule)
        if not violated:
            continue
        yield candidate


def run_feature_lints(lints, ctx, features):
    '''
    Lazily yield each lint from `lints` whose `check_features(ctx, features)` is truthy.
    '''
    for candidate in lints:
        violated = candidate.check_features(ctx, features)
        if not violated:
            continue
        yield candidate


# lints that inspect the rule name and the name of the file the rule lives in.
NAME_LINTS = (
    NameCasing(),
    FilenameDoesntMatchRuleName(),
)


def lint_name(ctx, rule):
    '''
    Run the `NAME_LINTS` against the given rule.

    Returns a generator of the lint instances that the rule violates.
    '''
    return run_lints(NAME_LINTS, ctx, rule)


# lints that inspect the `scope` meta key.
SCOPE_LINTS = (
    MissingScope(),
    InvalidScope(),
)


def lint_scope(ctx, rule):
    '''
    Run the `SCOPE_LINTS` against the given rule.

    Returns a generator of the lint instances that the rule violates.
    '''
    return run_lints(SCOPE_LINTS, ctx, rule)


# lints that inspect the remaining meta keys: namespace, author, examples, and unknown keys.
META_LINTS = (
    MissingNamespace(),
    NamespaceDoesntMatchRulePath(),
    MissingAuthor(),
    MissingExamples(),
    MissingExampleOffset(),
    ExampleFileDNE(),
    UnusualMetaField(),
)


def lint_meta(ctx, rule):
    '''
    Run the `META_LINTS` against the given rule.

    Returns a generator of the lint instances that the rule violates.
    '''
    return run_lints(META_LINTS, ctx, rule)


# lints that inspect the features used by a rule, via `check_features`.
FEATURE_LINTS = (
    FeatureStringTooShort(),
)


def lint_features(ctx, rule):
    '''
    Run the `FEATURE_LINTS` against the features of the given rule and its dependencies.

    The features are collected eagerly, when this function is called;
    the returned generator then yields the lint instances that are violated.
    '''
    features = get_features(ctx, rule)
    return run_feature_lints(FEATURE_LINTS, ctx, features)


def get_features(ctx, rule):
    '''
    Collect the features of the given rule and of every rule it depends on,
    including subscopes and matched rules.

    Dependencies are looked up by name in `ctx['rules'].rules`.

    Returns:
      list: features of the rule itself first, followed by those of each dependency.
    '''
    # resolve every dependency before extracting anything,
    # so an unknown dependency fails up front.
    related = [rule]
    for dep_name in rule.get_dependencies():
        related.append(ctx['rules'].rules[dep_name])

    collected = []
    for related_rule in related:
        collected.extend(get_rule_features(related_rule))
    return collected


def get_rule_features(rule):
    '''
    Flatten the statement tree of the given rule into its leaves.

    Anything that is not a `capa.engine.Statement` is treated as a leaf (a feature).

    Returns:
      list: the leaves, in depth-first order.
    '''
    def leaves_of(node):
        if not isinstance(node, capa.engine.Statement):
            return [node]

        found = []
        for child in node.get_children():
            found.extend(leaves_of(child))
        return found

    return leaves_of(rule.statement)


# lints that evaluate the rule logic against referenced samples.
LOGIC_LINTS = (
    DoesntMatchExample(),
)


def lint_logic(ctx, rule):
    '''
    Run the `LOGIC_LINTS` against the given rule.

    Returns a generator of the lint instances that the rule violates.
    '''
    return run_lints(LOGIC_LINTS, ctx, rule)


def is_nursery_rule(rule):
    '''
    Report whether the rule lives in the nursery.

    The nursery holds rules that are not fully polished yet,
    for instance rules without a reference to a public example of the technique.
    Their matches are still captured and reported.

    Returns the value of the `capa/nursery` meta key as is (None when absent),
    so callers should only rely on its truthiness.
    '''
    nursery_flag = rule.meta.get('capa/nursery')
    return nursery_flag


def lint_rule(ctx, rule):
    '''
    Run every lint against the given rule and print the violations, if any.

    Violations of nursery rules are printed indented and as `WARN`;
    all others are printed as `FAIL`.

    Returns:
      bool: True if there is at least one violation and the rule is not a nursery rule.
    '''
    logger.debug(rule.name)

    # create all lint iterables up front, then drain them in order.
    checks = [
        lint_name(ctx, rule),
        lint_scope(ctx, rule),
        lint_meta(ctx, rule),
        lint_logic(ctx, rule),
        lint_features(ctx, rule),
    ]
    violations = []
    for check in checks:
        violations.extend(check)

    if not violations:
        return False

    if is_nursery_rule(rule):
        header_prefix = '    (nursery) '
        indent = '    '
        level = 'WARN'
        should_fail = False
    else:
        header_prefix = ''
        indent = ''
        level = 'FAIL'
        should_fail = True

    category = rule.meta.get('rule-category')
    if category:
        category_suffix = '(%s)' % category
    else:
        category_suffix = ''

    print('')
    print('%s%s %s' % (header_prefix, rule.name, category_suffix))

    for violation in violations:
        print('%s  %s: %s: %s' % (indent, level, violation.name, violation.recommendation))

    return should_fail


def lint(ctx, rules):
    '''
    Lint every rule in the given rule set, skipping subscope rules.

    Args:
      ctx (dict): lint context that is handed to every lint.
        `ctx['samples']` (Dict[string, string]) maps from sample id to path:
        for each sample, the sha256, md5, and filename are recorded as sample ids.
        see `collect_samples(path)`.
      rules: the rule set to lint; its `.rules` mapping provides the rules.

    Returns:
      bool: True if `lint_rule` reported a failure for at least one rule.
    '''
    any_failed = False
    for rule in rules.rules.values():
        is_subscope = rule.meta.get('capa/subscope-rule', False)
        if is_subscope:
            continue

        # always lint the rule, even after an earlier failure, so everything gets reported.
        if lint_rule(ctx, rule):
            any_failed = True

    return any_failed


def collect_samples(path):
    '''
    Recurse through the given path, collecting all file paths,
    indexed by their content sha256, md5, and filename.

    Files ending in `.viv`, `.idb`, or `.i64` are skipped, as are files that cannot be read.
    Each hash is recorded in both lower and upper case hex.
    When two files share a key (e.g. the same filename), the one visited last wins.

    Args:
      path (str): the directory to walk.

    Returns:
      Dict[str, str]: map from sample id to file path.
    '''
    # read in chunks so that large samples are never held in memory in full.
    chunk_size = 1024 * 1024

    samples = {}
    for root, _, files in os.walk(path):
        for name in files:
            if name.endswith(('.viv', '.idb', '.i64')):
                continue

            # use a distinct name, rather than shadowing the `path` parameter.
            sample_path = os.path.join(root, name)

            sha256 = hashlib.sha256()
            md5 = hashlib.md5()
            try:
                with open(sample_path, 'rb') as f:
                    for chunk in iter(lambda: f.read(chunk_size), b''):
                        sha256.update(chunk)
                        md5.update(chunk)
            except IOError:
                # best effort: skip unreadable files.
                continue

            sha256_hex = sha256.hexdigest()
            md5_hex = md5.hexdigest()

            samples[sha256_hex.lower()] = sample_path
            samples[sha256_hex.upper()] = sample_path
            samples[md5_hex.lower()] = sample_path
            samples[md5_hex.upper()] = sample_path
            samples[name] = sample_path

    return samples


def main(argv=None):
    '''
    Command line entry point: load the rules, collect the samples, and lint.

    Args:
      argv (List[str]): command line arguments; defaults to `sys.argv[1:]`.

    Returns:
      int: 0 if no lint failed, 1 if at least one rule failed a lint,
        -1 if the rules could not be loaded or the samples path does not exist.
    '''
    if argv is None:
        argv = sys.argv[1:]

    # by default, look for samples next to the scripts directory, in tests/data.
    default_samples = os.path.join(os.path.dirname(__file__), '..', 'tests', 'data')

    parser = argparse.ArgumentParser(description='A program.')
    parser.add_argument('rules', type=str, help='Path to rules')
    parser.add_argument('--samples', type=str, default=default_samples, help='Path to samples')
    parser.add_argument('--thorough', action='store_true',
                        help='Enable thorough linting - takes more time, but does a better job')
    parser.add_argument('-v', '--verbose', action='store_true', help='Enable debug logging')
    parser.add_argument('-q', '--quiet', action='store_true', help='Disable all output but errors')
    args = parser.parse_args(args=argv)

    # --verbose takes precedence over --quiet.
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    elif args.quiet:
        level = logging.ERROR

    logging.basicConfig(level=level)
    logging.getLogger('capa.lint').setLevel(level)

    # silence everything from the analysis itself; only the lint output is of interest.
    capa.main.set_vivisect_log_level(logging.CRITICAL)
    logging.getLogger('capa').setLevel(logging.CRITICAL)

    try:
        rules = capa.rules.RuleSet(capa.main.get_rules(args.rules))
        logger.info('successfully loaded %s rules', len(rules))
    except (IOError, capa.rules.InvalidRule) as e:
        logger.error('%s', str(e))
        return -1

    logger.info('collecting potentially referenced samples')
    if not os.path.exists(args.samples):
        logger.error('samples path %s does not exist', args.samples)
        return -1

    ctx = {
        'samples': collect_samples(args.samples),
        'rules': rules,
        'is_thorough': args.thorough,
    }

    if lint(ctx, rules):
        return 1

    logger.info('no suggestions, nice!')
    return 0


# when run as a script, use the result of `main` as the process exit code.
if __name__ == '__main__':
    sys.exit(main())