diff --git a/capa/engine.py b/capa/engine.py index 4be1e32c..9ce4397a 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -7,23 +7,24 @@ import capa.features class Statement(object): - ''' + """ superclass for structural nodes, such as and/or/not. this exists to provide a default impl for `__str__` and `__repr__`, and to declare the interface method `evaluate` - ''' + """ + def __init__(self): super(Statement, self).__init__() self.name = self.__class__.__name__ def __str__(self): - return '%s(%s)' % (self.name.lower(), ','.join(map(str, self.get_children()))) + return "%s(%s)" % (self.name.lower(), ",".join(map(str, self.get_children()))) def __repr__(self): return str(self) def evaluate(self, ctx): - ''' + """ classes that inherit `Statement` must implement `evaluate` args: @@ -31,30 +32,30 @@ class Statement(object): returns: Result - ''' + """ raise NotImplementedError() def get_children(self): - if hasattr(self, 'child'): + if hasattr(self, "child"): yield self.child - if hasattr(self, 'children'): + if hasattr(self, "children"): for child in self.children: yield child def replace_child(self, existing, new): - if hasattr(self, 'child'): + if hasattr(self, "child"): if self.child is existing: self.child = new - if hasattr(self, 'children'): + if hasattr(self, "children"): for i, child in enumerate(self.children): if child is existing: self.children[i] = new class Result(object): - ''' + """ represents the results of an evaluation of statements against features. instances of this class should behave like a bool, @@ -65,15 +66,16 @@ class Result(object): as well as the children Result instances. we need this so that we can render the tree of expressions and their results. 
- ''' + """ + def __init__(self, success, statement, children, locations=None): - ''' + """ args: success (bool) statement (capa.engine.Statement or capa.features.Feature) children (list[Result]) locations (iterable[VA]) - ''' + """ super(Result, self).__init__() self.success = success self.statement = statement @@ -93,7 +95,8 @@ class Result(object): class And(Statement): - '''match if all of the children evaluate to True.''' + """match if all of the children evaluate to True.""" + def __init__(self, *children): super(And, self).__init__() self.children = list(children) @@ -105,7 +108,8 @@ class And(Statement): class Or(Statement): - '''match if any of the children evaluate to True.''' + """match if any of the children evaluate to True.""" + def __init__(self, *children): super(Or, self).__init__() self.children = list(children) @@ -117,7 +121,8 @@ class Or(Statement): class Not(Statement): - '''match only if the child evaluates to False.''' + """match only if the child evaluates to False.""" + def __init__(self, child): super(Not, self).__init__() self.child = child @@ -129,7 +134,8 @@ class Not(Statement): class Some(Statement): - '''match if at least N of the children evaluate to True.''' + """match if at least N of the children evaluate to True.""" + def __init__(self, count, *children): super(Some, self).__init__() self.count = count @@ -146,7 +152,8 @@ class Some(Statement): class Range(Statement): - '''match if the child is contained in the ctx set with a count in the given range.''' + """match if the child is contained in the ctx set with a count in the given range.""" + def __init__(self, child, min=None, max=None): super(Range, self).__init__() self.child = child @@ -162,27 +169,28 @@ class Range(Statement): def __str__(self): if self.max == (1 << 64 - 1): - return 'range(%s, min=%d, max=infinity)' % (str(self.child), self.min) + return "range(%s, min=%d, max=infinity)" % (str(self.child), self.min) else: - return 'range(%s, min=%d, max=%d)' % 
(str(self.child), self.min, self.max) + return "range(%s, min=%d, max=%d)" % (str(self.child), self.min, self.max) class Regex(Statement): - '''match if the given pattern matches a String feature.''' + """match if the given pattern matches a String feature.""" + def __init__(self, pattern): super(Regex, self).__init__() self.pattern = pattern - pat = self.pattern[len('/'):-len('/')] + pat = self.pattern[len("/") : -len("/")] flags = re.DOTALL - if pattern.endswith('/i'): - pat = self.pattern[len('/'):-len('/i')] + if pattern.endswith("/i"): + pat = self.pattern[len("/") : -len("/i")] flags |= re.IGNORECASE self.re = re.compile(pat, flags) - self.match = '' + self.match = "" def evaluate(self, ctx): for feature, locations in ctx.items(): - if not isinstance(feature, (capa.features.String, )): + if not isinstance(feature, (capa.features.String,)): continue # `re.search` finds a match anywhere in the given string @@ -200,27 +208,28 @@ class Regex(Statement): class Subscope(Statement): - ''' + """ a subscope element is a placeholder in a rule - it should not be evaluated directly. the engine should preprocess rules to extract subscope statements into their own rules. - ''' + """ + def __init__(self, scope, child): super(Subscope, self).__init__() self.scope = scope self.child = child def evaluate(self, ctx): - raise ValueError('cannot evaluate a subscope directly!') + raise ValueError("cannot evaluate a subscope directly!") def topologically_order_rules(rules): - ''' + """ order the given rules such that dependencies show up before dependents. this means that as we match rules, we can add features for the matches, and these will be matched by subsequent rules if they follow this order. assumes that the rule dependency graph is a DAG. - ''' + """ # we evaluate `rules` multiple times, so if its a generator, realize it into a list. 
rules = list(rules) namespaces = capa.rules.index_rules_by_namespace(rules) @@ -245,7 +254,7 @@ def topologically_order_rules(rules): def match(rules, features, va): - ''' + """ Args: rules (List[capa.rules.Rule]): these must already be ordered topologically by dependency. features (Mapping[capa.features.Feature, int]): @@ -255,7 +264,7 @@ def match(rules, features, va): Tuple[List[capa.features.Feature], Dict[str, Tuple[int, capa.engine.Result]]]: two-tuple with entries: - list of features used for matching (which may be greater than argument, due to rule match features), and - mapping from rule name to (location of match, result object) - ''' + """ results = collections.defaultdict(list) # copy features so that we can modify it @@ -270,10 +279,10 @@ def match(rules, features, va): results[rule.name].append((va, res)) features[capa.features.MatchedRule(rule.name)].add(va) - namespace = rule.meta.get('namespace') + namespace = rule.meta.get("namespace") if namespace: while namespace: features[capa.features.MatchedRule(namespace)].add(va) - namespace, _, _ = namespace.rpartition('/') + namespace, _, _ = namespace.rpartition("/") return (features, results) diff --git a/capa/features/__init__.py b/capa/features/__init__.py index 9fec2d76..3e71481e 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -11,9 +11,9 @@ MAX_BYTES_FEATURE_SIZE = 0x100 def bytes_to_str(b): if sys.version_info[0] >= 3: - return str(codecs.encode(b, 'hex').decode('utf-8')) + return str(codecs.encode(b, "hex").decode("utf-8")) else: - return codecs.encode(b, 'hex') + return codecs.encode(b, "hex") class Feature(object): @@ -29,7 +29,7 @@ class Feature(object): return self.name == other.name and self.args == other.args def __str__(self): - return '%s(%s)' % (self.name.lower(), ','.join(self.args)) + return "%s(%s)" % (self.name.lower(), ",".join(self.args)) def __repr__(self): return str(self) @@ -41,8 +41,7 @@ class Feature(object): return self.__dict__ def 
freeze_serialize(self): - return (self.__class__.__name__, - self.args) + return (self.__class__.__name__, self.args) @classmethod def freeze_deserialize(cls, args): @@ -55,30 +54,30 @@ class MatchedRule(Feature): self.rule_name = rule_name def __str__(self): - return 'match(%s)' % (self.rule_name) + return "match(%s)" % (self.rule_name) class Characteristic(Feature): def __init__(self, name, value=None): - ''' + """ when `value` is not provided, this serves as descriptor for a class of characteristics. this is only used internally, such as in `rules.py` when checking if a statement is supported by a given scope. - ''' + """ super(Characteristic, self).__init__([name, value]) self.name = name self.value = value def evaluate(self, ctx): if self.value is None: - raise ValueError('cannot evaluate characteristc %s with empty value' % (str(self))) + raise ValueError("cannot evaluate characteristc %s with empty value" % (str(self))) return super(Characteristic, self).evaluate(ctx) def __str__(self): if self.value is None: - return 'characteristic(%s)' % (self.name) + return "characteristic(%s)" % (self.name) else: - return 'characteristic(%s(%s))' % (self.name, self.value) + return "characteristic(%s(%s))" % (self.name, self.value) class String(Feature): @@ -98,7 +97,7 @@ class Bytes(Feature): def evaluate(self, ctx): for feature, locations in ctx.items(): - if not isinstance(feature, (capa.features.Bytes, )): + if not isinstance(feature, (capa.features.Bytes,)): continue if feature.value.startswith(self.value): @@ -108,14 +107,13 @@ class Bytes(Feature): def __str__(self): if self.symbol: - return 'bytes(0x%s = %s)' % (bytes_to_str(self.value).upper(), self.symbol) + return "bytes(0x%s = %s)" % (bytes_to_str(self.value).upper(), self.symbol) else: - return 'bytes(0x%s)' % (bytes_to_str(self.value).upper()) + return "bytes(0x%s)" % (bytes_to_str(self.value).upper()) def freeze_serialize(self): - return (self.__class__.__name__, - [bytes_to_str(x).upper() for x in 
self.args]) + return (self.__class__.__name__, [bytes_to_str(x).upper() for x in self.args]) @classmethod def freeze_deserialize(cls, args): - return cls(*[codecs.decode(x, 'hex') for x in args]) + return cls(*[codecs.decode(x, "hex") for x in args]) diff --git a/capa/features/basicblock.py b/capa/features/basicblock.py index 3f49d881..c1c7234c 100644 --- a/capa/features/basicblock.py +++ b/capa/features/basicblock.py @@ -6,4 +6,4 @@ class BasicBlock(Feature): super(BasicBlock, self).__init__([]) def __str__(self): - return 'basic block' + return "basic block" diff --git a/capa/features/extractors/__init__.py b/capa/features/extractors/__init__.py index 0486a63a..eb0f0b10 100644 --- a/capa/features/extractors/__init__.py +++ b/capa/features/extractors/__init__.py @@ -10,11 +10,11 @@ try: except (ImportError, SyntaxError): pass -__all__ = ['ida', 'viv'] +__all__ = ["ida", "viv"] class FeatureExtractor(object): - ''' + """ FeatureExtractor defines the interface for fetching features from a sample. There may be multiple backends that support fetching features for capa. @@ -27,7 +27,8 @@ class FeatureExtractor(object): Also, this provides a way to hook in an IDA backend. This class is not instantiated directly; it is the base class for other implementations. - ''' + """ + __metaclass__ = abc.ABCMeta def __init__(self): @@ -40,7 +41,7 @@ class FeatureExtractor(object): @abc.abstractmethod def extract_file_features(self): - ''' + """ extract file-scope features. example:: @@ -51,12 +52,12 @@ class FeatureExtractor(object): yields: Tuple[capa.features.Feature, int]: feature and its location - ''' + """ raise NotImplemented @abc.abstractmethod def get_functions(self): - ''' + """ enumerate the functions and provide opaque values that will subsequently be provided to `.extract_function_features()`, etc. @@ -67,12 +68,12 @@ class FeatureExtractor(object): yields: any: the opaque function value. 
- ''' + """ raise NotImplemented @abc.abstractmethod def extract_function_features(self, f): - ''' + """ extract function-scope features. the arguments are opaque values previously provided by `.get_functions()`, etc. @@ -88,12 +89,12 @@ class FeatureExtractor(object): yields: Tuple[capa.features.Feature, int]: feature and its location - ''' + """ raise NotImplemented @abc.abstractmethod def get_basic_blocks(self, f): - ''' + """ enumerate the basic blocks in the given function and provide opaque values that will subsequently be provided to `.extract_basic_block_features()`, etc. @@ -104,12 +105,12 @@ class FeatureExtractor(object): yields: any: the opaque basic block value. - ''' + """ raise NotImplemented @abc.abstractmethod def extract_basic_block_features(self, f, bb): - ''' + """ extract basic block-scope features. the arguments are opaque values previously provided by `.get_functions()`, etc. @@ -127,12 +128,12 @@ class FeatureExtractor(object): yields: Tuple[capa.features.Feature, int]: feature and its location - ''' + """ raise NotImplemented @abc.abstractmethod def get_instructions(self, f, bb): - ''' + """ enumerate the instructions in the given basic block and provide opaque values that will subsequently be provided to `.extract_insn_features()`, etc. @@ -143,12 +144,12 @@ class FeatureExtractor(object): yields: any: the opaque function value. - ''' + """ raise NotImplemented @abc.abstractmethod def extract_insn_features(self, f, bb, insn): - ''' + """ extract instruction-scope features. the arguments are opaque values previously provided by `.get_functions()`, etc. @@ -168,12 +169,12 @@ class FeatureExtractor(object): yields: Tuple[capa.features.Feature, int]: feature and its location - ''' + """ raise NotImplemented class NullFeatureExtractor(FeatureExtractor): - ''' + """ An extractor that extracts some user-provided features. The structure of the single parameter is demonstrated in the example below. 
@@ -211,64 +212,66 @@ class NullFeatureExtractor(FeatureExtractor): 0x40200: ... } ) - ''' + """ + def __init__(self, features): super(NullFeatureExtractor, self).__init__() self.features = features def extract_file_features(self): - for p in self.features.get('file features', []): + for p in self.features.get("file features", []): va, feature = p yield feature, va def get_functions(self): - for va in sorted(self.features['functions'].keys()): + for va in sorted(self.features["functions"].keys()): yield va def extract_function_features(self, f): - for p in (self.features # noqa: E127 line over-indented - .get('functions', {}) - .get(f, {}) - .get('features', [])): + for p in self.features.get("functions", {}).get(f, {}).get("features", []): # noqa: E127 line over-indented va, feature = p yield feature, va def get_basic_blocks(self, f): - for va in sorted(self.features # noqa: E127 line over-indented - .get('functions', {}) - .get(f, {}) - .get('basic blocks', {}) - .keys()): + for va in sorted( + self.features.get("functions", {}) # noqa: E127 line over-indented + .get(f, {}) + .get("basic blocks", {}) + .keys() + ): yield va def extract_basic_block_features(self, f, bb): - for p in (self.features # noqa: E127 line over-indented - .get('functions', {}) - .get(f, {}) - .get('basic blocks', {}) - .get(bb, {}) - .get('features', [])): + for p in ( + self.features.get("functions", {}) # noqa: E127 line over-indented + .get(f, {}) + .get("basic blocks", {}) + .get(bb, {}) + .get("features", []) + ): va, feature = p yield feature, va def get_instructions(self, f, bb): - for va in sorted(self.features # noqa: E127 line over-indented - .get('functions', {}) - .get(f, {}) - .get('basic blocks', {}) - .get(bb, {}) - .get('instructions', {}) - .keys()): + for va in sorted( + self.features.get("functions", {}) # noqa: E127 line over-indented + .get(f, {}) + .get("basic blocks", {}) + .get(bb, {}) + .get("instructions", {}) + .keys() + ): yield va def 
extract_insn_features(self, f, bb, insn): - for p in (self.features # noqa: E127 line over-indented - .get('functions', {}) - .get(f, {}) - .get('basic blocks', {}) - .get(bb, {}) - .get('instructions', {}) - .get(insn, {}) - .get('features', [])): + for p in ( + self.features.get("functions", {}) # noqa: E127 line over-indented + .get(f, {}) + .get("basic blocks", {}) + .get(bb, {}) + .get("instructions", {}) + .get(insn, {}) + .get("features", []) + ): va, feature = p yield feature, va diff --git a/capa/features/extractors/helpers.py b/capa/features/extractors/helpers.py index 165221f7..e43133e0 100644 --- a/capa/features/extractors/helpers.py +++ b/capa/features/extractors/helpers.py @@ -10,27 +10,27 @@ def xor_static(data, i): if sys.version_info >= (3, 0): return bytes(c ^ i for c in data) else: - return ''.join(chr(ord(c) ^ i) for c in data) + return "".join(chr(ord(c) ^ i) for c in data) def is_aw_function(function_name): - ''' + """ is the given function name an A/W function? these are variants of functions that, on Windows, accept either a narrow or wide string. - ''' + """ if len(function_name) < 2: return False # last character should be 'A' or 'W' - if function_name[-1] not in ('A', 'W'): + if function_name[-1] not in ("A", "W"): return False # second to last character should be lowercase letter - return 'a' <= function_name[-2] <= 'z' or '0' <= function_name[-2] <= '9' + return "a" <= function_name[-2] <= "z" or "0" <= function_name[-2] <= "9" def generate_api_features(apiname, va): - ''' + """ for a given function name and address, generate API names. we over-generate features to make matching easier. these include: @@ -38,7 +38,7 @@ def generate_api_features(apiname, va): - kernel32.CreateFile - CreateFileA - CreateFile - ''' + """ # (kernel32.CreateFileA, 0x401000) yield API(apiname), va @@ -46,8 +46,8 @@ def generate_api_features(apiname, va): # (kernel32.CreateFile, 0x401000) yield API(apiname[:-1]), va - if '.' 
in apiname: - modname, impname = apiname.split('.') + if "." in apiname: + modname, impname = apiname.split(".") # strip modname to support importname-only matching # (CreateFileA, 0x401000) yield API(impname), va diff --git a/capa/features/extractors/ida/__init__.py b/capa/features/extractors/ida/__init__.py index a12df810..bd69a36f 100644 --- a/capa/features/extractors/ida/__init__.py +++ b/capa/features/extractors/ida/__init__.py @@ -26,17 +26,17 @@ def get_va(self): def add_va_int_cast(o): - ''' + """ dynamically add a cast-to-int (`__int__`) method to the given object that returns the value of the `.va` property. this bit of skullduggery lets use cast viv-utils objects as ints. the correct way of doing this is to update viv-utils (or subclass the objects here). - ''' + """ if sys.version_info >= (3, 0): - setattr(o, '__int__', types.MethodType(get_va, o)) + setattr(o, "__int__", types.MethodType(get_va, o)) else: - setattr(o, '__int__', types.MethodType(get_va, o, type(o))) + setattr(o, "__int__", types.MethodType(get_va, o, type(o))) return o diff --git a/capa/features/extractors/ida/basicblock.py b/capa/features/extractors/ida/basicblock.py index 51ba648a..41e67eb4 100644 --- a/capa/features/extractors/ida/basicblock.py +++ b/capa/features/extractors/ida/basicblock.py @@ -15,23 +15,23 @@ from capa.features.extractors.helpers import MIN_STACKSTRING_LEN def _ida_get_printable_len(op): - ''' Return string length if all operand bytes are ascii or utf16-le printable + """ Return string length if all operand bytes are ascii or utf16-le printable args: op (IDA op_t) - ''' + """ op_val = helpers.mask_op_val(op) if op.dtype == idaapi.dt_byte: - chars = struct.pack('= (3, 0): @@ -44,7 +44,7 @@ def _ida_get_printable_len(op): if all(c == 0x00 for c in chars[1::2]): return _is_printable_ascii(chars[::2]) else: - if all(c == '\x00' for c in chars[1::2]): + if all(c == "\x00" for c in chars[1::2]): return _is_printable_ascii(chars[::2]) if _is_printable_ascii(chars): @@ 
-57,32 +57,32 @@ def _ida_get_printable_len(op): def _is_mov_imm_to_stack(insn): - ''' verify instruction moves immediate onto stack + """ verify instruction moves immediate onto stack args: insn (IDA insn_t) - ''' + """ if insn.Op2.type != idaapi.o_imm: return False if not helpers.is_op_stack_var(insn.ea, 0): return False - if not insn.get_canon_mnem().startswith('mov'): + if not insn.get_canon_mnem().startswith("mov"): return False return True def _ida_bb_contains_stackstring(f, bb): - ''' check basic block for stackstring indicators + """ check basic block for stackstring indicators true if basic block contains enough moves of constant bytes to the stack args: f (IDA func_t) bb (IDA BasicBlock) - ''' + """ count = 0 for insn in helpers.get_instructions_in_range(bb.start_ea, bb.end_ea): @@ -96,25 +96,25 @@ def _ida_bb_contains_stackstring(f, bb): def extract_bb_stackstring(f, bb): - ''' extract stackstring indicators from basic block + """ extract stackstring indicators from basic block args: f (IDA func_t) bb (IDA BasicBlock) - ''' + """ if _ida_bb_contains_stackstring(f, bb): - yield Characteristic('stack string', True), bb.start_ea + yield Characteristic("stack string", True), bb.start_ea def _ida_bb_contains_tight_loop(f, bb): - ''' check basic block for stackstring indicators + """ check basic block for stackstring indicators true if last instruction in basic block branches to basic block start args: f (IDA func_t) bb (IDA BasicBlock) - ''' + """ bb_end = idc.prev_head(bb.end_ea) if bb.start_ea < bb_end: @@ -126,23 +126,23 @@ def _ida_bb_contains_tight_loop(f, bb): def extract_bb_tight_loop(f, bb): - ''' extract tight loop indicators from a basic block + """ extract tight loop indicators from a basic block args: f (IDA func_t) bb (IDA BasicBlock) - ''' + """ if _ida_bb_contains_tight_loop(f, bb): - yield Characteristic('tight loop', True), bb.start_ea + yield Characteristic("tight loop", True), bb.start_ea def extract_features(f, bb): - ''' extract basic 
block features + """ extract basic block features args: f (IDA func_t) bb (IDA BasicBlock) - ''' + """ yield BasicBlock(), bb.start_ea for bb_handler in BASIC_BLOCK_HANDLERS: @@ -166,5 +166,5 @@ def main(): pprint.pprint(features) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/capa/features/extractors/ida/file.py b/capa/features/extractors/ida/file.py index f75bf148..b38f5ffb 100644 --- a/capa/features/extractors/ida/file.py +++ b/capa/features/extractors/ida/file.py @@ -16,32 +16,39 @@ import capa.features.extractors.ida.helpers def _ida_check_segment_for_pe(seg): - ''' check segment for embedded PE + """ check segment for embedded PE adapted for IDA from: https://github.com/vivisect/vivisect/blob/7be4037b1cecc4551b397f840405a1fc606f9b53/PE/carve.py#L19 args: seg (IDA segment_t) - ''' + """ seg_max = seg.end_ea - mz_xor = [(capa.features.extractors.helpers.xor_static(b'MZ', i), - capa.features.extractors.helpers.xor_static(b'PE', i), - i) - for i in range(256)] - todo = [(capa.features.extractors.ida.helpers.find_byte_sequence(seg.start_ea, seg.end_ea, mzx), mzx, pex, i) for mzx, pex, i in mz_xor] + mz_xor = [ + ( + capa.features.extractors.helpers.xor_static(b"MZ", i), + capa.features.extractors.helpers.xor_static(b"PE", i), + i, + ) + for i in range(256) + ] + todo = [ + (capa.features.extractors.ida.helpers.find_byte_sequence(seg.start_ea, seg.end_ea, mzx), mzx, pex, i) + for mzx, pex, i in mz_xor + ] todo = [(off, mzx, pex, i) for (off, mzx, pex, i) in todo if off != idaapi.BADADDR] while len(todo): off, mzx, pex, i = todo.pop() # The MZ header has one field we will check e_lfanew is at 0x3c - e_lfanew = off + 0x3c + e_lfanew = off + 0x3C if seg_max < (e_lfanew + 4): continue - newoff = struct.unpack('= (3, 0): - return idaapi.find_binary(start, end, ' '.join(['%02x' % b for b in seq]), 0, idaapi.SEARCH_DOWN) + return idaapi.find_binary(start, end, " ".join(["%02x" % b for b in seq]), 0, idaapi.SEARCH_DOWN) else: - return 
idaapi.find_binary(start, end, ' '.join(['%02x' % ord(b) for b in seq]), 0, idaapi.SEARCH_DOWN) + return idaapi.find_binary(start, end, " ".join(["%02x" % ord(b) for b in seq]), 0, idaapi.SEARCH_DOWN) def get_functions(start=None, end=None, ignore_thunks=False, ignore_libs=False): - ''' get functions, range optional + """ get functions, range optional args: start: min virtual address @@ -29,7 +29,7 @@ def get_functions(start=None, end=None, ignore_thunks=False, ignore_libs=False): ret: yield func_t* - ''' + """ for ea in idautils.Functions(start=start, end=end): f = idaapi.get_func(ea) @@ -43,7 +43,7 @@ def get_functions(start=None, end=None, ignore_thunks=False, ignore_libs=False): def get_segments(): - ''' Get list of segments (sections) in the binary image ''' + """ Get list of segments (sections) in the binary image """ for n in range(idaapi.get_segm_qty()): seg = idaapi.getnseg(n) if seg: @@ -51,11 +51,11 @@ def get_segments(): def get_segment_buffer(seg): - ''' return bytes stored in a given segment + """ return bytes stored in a given segment decrease buffer size until IDA is able to read bytes from the segment - ''' - buff = b'' + """ + buff = b"" sz = seg.end_ea - seg.start_ea while sz > 0: @@ -65,11 +65,11 @@ def get_segment_buffer(seg): sz -= 0x1000 # IDA returns None if get_bytes fails, so convert for consistent return type - return buff if buff else b'' + return buff if buff else b"" def get_file_imports(): - ''' get file imports ''' + """ get file imports """ _imports = {} for idx in range(idaapi.get_import_module_qty()): @@ -79,9 +79,9 @@ def get_file_imports(): continue def _inspect_import(ea, name, ordi): - if name and name.startswith('__imp_'): + if name and name.startswith("__imp_"): # handle mangled names starting - name = name[len('__imp_'):] + name = name[len("__imp_") :] _imports[ea] = (dllname.lower(), name, ordi) return True @@ -91,14 +91,14 @@ def get_file_imports(): def get_instructions_in_range(start, end): - ''' yield instructions in 
range + """ yield instructions in range args: start: virtual address (inclusive) end: virtual address (exclusive) yield: (insn_t*) - ''' + """ for head in idautils.Heads(start, end): inst = idautils.DecodeInstruction(head) if inst: @@ -106,7 +106,7 @@ def get_instructions_in_range(start, end): def is_operand_equal(op1, op2): - ''' compare two IDA op_t ''' + """ compare two IDA op_t """ if op1.flags != op2.flags: return False @@ -132,14 +132,12 @@ def is_operand_equal(op1, op2): def is_basic_block_equal(bb1, bb2): - ''' compare two IDA BasicBlock ''' - return bb1.start_ea == bb2.start_ea \ - and bb1.end_ea == bb2.end_ea \ - and bb1.type == bb2.type + """ compare two IDA BasicBlock """ + return bb1.start_ea == bb2.start_ea and bb1.end_ea == bb2.end_ea and bb1.type == bb2.type def basic_block_size(bb): - ''' calculate size of basic block ''' + """ calculate size of basic block """ return bb.end_ea - bb.start_ea @@ -152,11 +150,11 @@ def read_bytes_at(ea, count): def find_string_at(ea, min=4): - ''' check if ASCII string exists at a given virtual address ''' + """ check if ASCII string exists at a given virtual address """ found = idaapi.get_strlit_contents(ea, -1, idaapi.STRTYPE_C) if found and len(found) > min: try: - found = found.decode('ascii') + found = found.decode("ascii") # hacky check for IDA bug; get_strlit_contents also reads Unicode as # myy__uunniiccoodde when searching in ASCII mode so we check for that here # and return the fixed up value @@ -169,11 +167,11 @@ def find_string_at(ea, min=4): def get_op_phrase_info(op): - ''' parse phrase features from operand + """ parse phrase features from operand Pretty much dup of sark's implementation: https://github.com/tmr232/Sark/blob/master/sark/code/instruction.py#L28-L73 - ''' + """ if op.type not in (idaapi.o_phrase, idaapi.o_displ): return @@ -202,21 +200,21 @@ def get_op_phrase_info(op): # This is only relevant to Intel architectures. 
index = None - return {'base': base, 'index': index, 'scale': scale, 'offset': offset} + return {"base": base, "index": index, "scale": scale, "offset": offset} def is_op_write(insn, op): - ''' Check if an operand is written to (destination operand) ''' + """ Check if an operand is written to (destination operand) """ return idaapi.has_cf_chg(insn.get_canon_feature(), op.n) def is_op_read(insn, op): - ''' Check if an operand is read from (source operand) ''' + """ Check if an operand is read from (source operand) """ return idaapi.has_cf_use(insn.get_canon_feature(), op.n) def is_sp_modified(insn): - ''' determine if instruction modifies SP, ESP, RSP ''' + """ determine if instruction modifies SP, ESP, RSP """ for op in get_insn_ops(insn, op_type=(idaapi.o_reg,)): if op.reg != idautils.procregs.sp.reg: continue @@ -228,7 +226,7 @@ def is_sp_modified(insn): def is_bp_modified(insn): - ''' check if instruction modifies BP, EBP, RBP ''' + """ check if instruction modifies BP, EBP, RBP """ for op in get_insn_ops(insn, op_type=(idaapi.o_reg,)): if op.reg != idautils.procregs.bp.reg: continue @@ -240,12 +238,12 @@ def is_bp_modified(insn): def is_frame_register(reg): - ''' check if register is sp or bp ''' + """ check if register is sp or bp """ return reg in (idautils.procregs.sp.reg, idautils.procregs.bp.reg) def get_insn_ops(insn, op_type=None): - ''' yield op_t for instruction, filter on type if specified ''' + """ yield op_t for instruction, filter on type if specified """ for op in insn.ops: if op.type == idaapi.o_void: # avoid looping all 6 ops if only subset exists @@ -258,17 +256,17 @@ def get_insn_ops(insn, op_type=None): def ea_flags(ea): - ''' retrieve processor flags for a given address ''' + """ retrieve processor flags for a given address """ return idaapi.get_flags(ea) def is_op_stack_var(ea, n): - ''' check if operand is a stack variable ''' + """ check if operand is a stack variable """ return idaapi.is_stkvar(ea_flags(ea), n) def mask_op_val(op): - ''' 
mask off a value based on data type + """ mask off a value based on data type necesssary due to a bug in 64-bit @@ -277,22 +275,22 @@ def mask_op_val(op): insn.Op2.dtype == idaapi.dt_dword insn.Op2.value == 0xffffffffffffffff - ''' + """ masks = { idaapi.dt_byte: 0xFF, idaapi.dt_word: 0xFFFF, idaapi.dt_dword: 0xFFFFFFFF, - idaapi.dt_qword: 0xFFFFFFFFFFFFFFFF + idaapi.dt_qword: 0xFFFFFFFFFFFFFFFF, } mask = masks.get(op.dtype, None) if not mask: - raise ValueError('No support for operand data type 0x%x' % op.dtype) + raise ValueError("No support for operand data type 0x%x" % op.dtype) return mask & op.value def ea_to_offset(ea): - ''' convert virtual address to file offset ''' + """ convert virtual address to file offset """ return idaapi.get_fileregion_offset(ea) diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index 3526d67d..ff8bdf41 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -26,7 +26,7 @@ def get_imports(): def _check_for_api_call(insn): - ''' check instruction for API call ''' + """ check instruction for API call """ if not idaapi.is_call_insn(insn): return @@ -34,7 +34,7 @@ def _check_for_api_call(insn): imp = get_imports().get(call_ref, None) if imp: - yield '%s.%s' % (imp[0], imp[1]) + yield "%s.%s" % (imp[0], imp[1]) else: f = idaapi.get_func(call_ref) @@ -46,11 +46,11 @@ def _check_for_api_call(insn): imp = get_imports().get(thunk_ref, None) if imp: - yield '%s.%s' % (imp[0], imp[1]) + yield "%s.%s" % (imp[0], imp[1]) def extract_insn_api_features(f, bb, insn): - ''' parse instruction API features + """ parse instruction API features args: f (IDA func_t) @@ -59,14 +59,14 @@ def extract_insn_api_features(f, bb, insn): example: call dword [0x00473038] - ''' + """ for api_name in _check_for_api_call(insn): for feature, va in capa.features.extractors.helpers.generate_api_features(api_name, insn.ea): yield feature, va def extract_insn_number_features(f, bb, insn): - ''' 
parse instruction number features + """ parse instruction number features args: f (IDA func_t) @@ -75,7 +75,7 @@ def extract_insn_number_features(f, bb, insn): example: push 3136B0h ; dwControlCode - ''' + """ if idaapi.is_ret_insn(insn): # skip things like: # .text:0042250E retn 8 @@ -97,7 +97,7 @@ def extract_insn_number_features(f, bb, insn): def extract_insn_bytes_features(f, bb, insn): - ''' parse referenced byte sequences + """ parse referenced byte sequences args: f (IDA func_t) @@ -106,7 +106,7 @@ def extract_insn_bytes_features(f, bb, insn): example: push offset iid_004118d4_IShellLinkA ; riid - ''' + """ if idaapi.is_call_insn(insn): # ignore call instructions return @@ -119,7 +119,7 @@ def extract_insn_bytes_features(f, bb, insn): def extract_insn_string_features(f, bb, insn): - ''' parse instruction string features + """ parse instruction string features args: f (IDA func_t) @@ -128,7 +128,7 @@ def extract_insn_string_features(f, bb, insn): example: push offset aAcr ; "ACR > " - ''' + """ for ref in idautils.DataRefsFrom(insn.ea): found = capa.features.extractors.ida.helpers.find_string_at(ref) if found: @@ -136,7 +136,7 @@ def extract_insn_string_features(f, bb, insn): def extract_insn_offset_features(f, bb, insn): - ''' parse instruction structure offset features + """ parse instruction structure offset features args: f (IDA func_t) @@ -145,7 +145,7 @@ def extract_insn_offset_features(f, bb, insn): example: .text:0040112F cmp [esi+4], ebx - ''' + """ for op in capa.features.extractors.ida.helpers.get_insn_ops(insn, op_type=(idaapi.o_phrase, idaapi.o_displ)): if capa.features.extractors.ida.helpers.is_op_stack_var(insn.ea, op.n): # skip stack offsets @@ -156,7 +156,7 @@ def extract_insn_offset_features(f, bb, insn): if not p_info: continue - op_off = p_info['offset'] + op_off = p_info["offset"] if 0 == op_off: # TODO: Do we want to record offset of zero? 
@@ -172,26 +172,26 @@ def extract_insn_offset_features(f, bb, insn): def _contains_stack_cookie_keywords(s): - ''' check if string contains stack cookie keywords + """ check if string contains stack cookie keywords Examples: xor ecx, ebp ; StackCookie mov eax, ___security_cookie - ''' + """ if not s: return False s = s.strip().lower() - if 'cookie' not in s: + if "cookie" not in s: return False - return any(keyword in s for keyword in ('stack', 'security')) + return any(keyword in s for keyword in ("stack", "security")) def _bb_stack_cookie_registers(bb): - ''' scan basic block for stack cookie operations + """ scan basic block for stack cookie operations yield registers ids that may have been used for stack cookie operations @@ -215,7 +215,7 @@ def _bb_stack_cookie_registers(bb): .text:004062FA jnz loc_40639D TODO: this is expensive, but necessary?... - ''' + """ for insn in capa.features.extractors.ida.helpers.get_instructions_in_range(bb.start_ea, bb.end_ea): if _contains_stack_cookie_keywords(idc.GetDisasm(insn.ea)): for op in capa.features.extractors.ida.helpers.get_insn_ops(insn, op_type=(idaapi.o_reg,)): @@ -225,7 +225,7 @@ def _bb_stack_cookie_registers(bb): def _is_nzxor_stack_cookie(f, bb, insn): - ''' check if nzxor is related to stack cookie ''' + """ check if nzxor is related to stack cookie """ if _contains_stack_cookie_keywords(idaapi.get_cmt(insn.ea, False)): # Example: # xor ecx, ebp ; StackCookie @@ -241,7 +241,7 @@ def _is_nzxor_stack_cookie(f, bb, insn): def extract_insn_nzxor_characteristic_features(f, bb, insn): - ''' parse instruction non-zeroing XOR instruction + """ parse instruction non-zeroing XOR instruction ignore expected non-zeroing XORs, e.g. 
security cookies @@ -249,7 +249,7 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn): f (IDA func_t) bb (IDA BasicBlock) insn (IDA insn_t) - ''' + """ if insn.itype != idaapi.NN_xor: return @@ -259,28 +259,28 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn): if _is_nzxor_stack_cookie(f, bb, insn): return - yield Characteristic('nzxor', True), insn.ea + yield Characteristic("nzxor", True), insn.ea def extract_insn_mnemonic_features(f, bb, insn): - ''' parse instruction mnemonic features + """ parse instruction mnemonic features args: f (IDA func_t) bb (IDA BasicBlock) insn (IDA insn_t) - ''' + """ yield Mnemonic(insn.get_canon_mnem()), insn.ea def extract_insn_peb_access_characteristic_features(f, bb, insn): - ''' parse instruction peb access + """ parse instruction peb access fs:[0x30] on x86, gs:[0x60] on x64 TODO: IDA should be able to do this.. - ''' + """ if insn.itype not in (idaapi.NN_push, idaapi.NN_mov): return @@ -290,40 +290,40 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn): disasm = idc.GetDisasm(insn.ea) - if ' fs:30h' in disasm or ' gs:60h' in disasm: + if " fs:30h" in disasm or " gs:60h" in disasm: # TODO: replace above with proper IDA - yield Characteristic('peb access', True), insn.ea + yield Characteristic("peb access", True), insn.ea def extract_insn_segment_access_features(f, bb, insn): - ''' parse instruction fs or gs access + """ parse instruction fs or gs access TODO: IDA should be able to do this... 
- ''' + """ if all(map(lambda op: op.type != idaapi.o_mem, insn.ops)): # try to optimize for only memory referencese return disasm = idc.GetDisasm(insn.ea) - if ' fs:' in disasm: + if " fs:" in disasm: # TODO: replace above with proper IDA - yield Characteristic('fs access', True), insn.ea + yield Characteristic("fs access", True), insn.ea - if ' gs:' in disasm: + if " gs:" in disasm: # TODO: replace above with proper IDA - yield Characteristic('gs access', True), insn.ea + yield Characteristic("gs access", True), insn.ea def extract_insn_cross_section_cflow(f, bb, insn): - ''' inspect the instruction for a CALL or JMP that crosses section boundaries + """ inspect the instruction for a CALL or JMP that crosses section boundaries args: f (IDA func_t) bb (IDA BasicBlock) insn (IDA insn_t) - ''' + """ for ref in idautils.CodeRefsFrom(insn.ea, False): if ref in get_imports().keys(): # ignore API calls @@ -336,11 +336,11 @@ def extract_insn_cross_section_cflow(f, bb, insn): if idaapi.getseg(ref) == idaapi.getseg(insn.ea): continue - yield Characteristic('cross section flow', True), insn.ea + yield Characteristic("cross section flow", True), insn.ea def extract_function_calls_from(f, bb, insn): - ''' extract functions calls from features + """ extract functions calls from features most relevant at the function scope, however, its most efficient to extract at the instruction scope @@ -348,17 +348,17 @@ def extract_function_calls_from(f, bb, insn): f (IDA func_t) bb (IDA BasicBlock) insn (IDA insn_t) - ''' + """ if not idaapi.is_call_insn(insn): # ignore jmp, etc. 
return for ref in idautils.CodeRefsFrom(insn.ea, False): - yield Characteristic('calls from', True), ref + yield Characteristic("calls from", True), ref def extract_function_indirect_call_characteristic_features(f, bb, insn): - ''' extract indirect function calls (e.g., call eax or call dword ptr [edx+4]) + """ extract indirect function calls (e.g., call eax or call dword ptr [edx+4]) does not include calls like => call ds:dword_ABD4974 most relevant at the function or basic block scope; @@ -368,22 +368,22 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn): f (IDA func_t) bb (IDA BasicBlock) insn (IDA insn_t) - ''' + """ if not idaapi.is_call_insn(insn): return if idc.get_operand_type(insn.ea, 0) in (idc.o_reg, idc.o_phrase, idc.o_displ): - yield Characteristic('indirect call', True), insn.ea + yield Characteristic("indirect call", True), insn.ea def extract_features(f, bb, insn): - ''' extract instruction features + """ extract instruction features args: f (IDA func_t) bb (IDA BasicBlock) insn (IDA insn_t) - ''' + """ for inst_handler in INSTRUCTION_HANDLERS: for feature, va in inst_handler(f, bb, insn): yield feature, va @@ -401,7 +401,7 @@ INSTRUCTION_HANDLERS = ( extract_insn_cross_section_cflow, extract_insn_segment_access_features, extract_function_calls_from, - extract_function_indirect_call_characteristic_features + extract_function_indirect_call_characteristic_features, ) @@ -416,5 +416,5 @@ def main(): pprint.pprint(features) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/capa/features/extractors/loops.py b/capa/features/extractors/loops.py index 1fe914ff..db156376 100644 --- a/capa/features/extractors/loops.py +++ b/capa/features/extractors/loops.py @@ -3,7 +3,7 @@ from networkx import nx def has_loop(edges, threshold=2): - ''' check if a list of edges representing a directed graph contains a loop + """ check if a list of edges representing a directed graph contains a loop args: edges: list of edge 
sets representing a directed graph i.e. [(1, 2), (2, 1)] @@ -11,7 +11,7 @@ def has_loop(edges, threshold=2): returns: bool - ''' + """ g = nx.DiGraph() g.add_edges_from(edges) return any(len(comp) >= threshold for comp in strongly_connected_components(g)) diff --git a/capa/features/extractors/strings.py b/capa/features/extractors/strings.py index de25b17c..3826ad61 100644 --- a/capa/features/extractors/strings.py +++ b/capa/features/extractors/strings.py @@ -7,26 +7,28 @@ import re from collections import namedtuple -ASCII_BYTE = r' !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t'.encode('ascii') -ASCII_RE_4 = re.compile(b'([%s]{%d,})' % (ASCII_BYTE, 4)) -UNICODE_RE_4 = re.compile(b'((?:[%s]\x00){%d,})' % (ASCII_BYTE, 4)) -REPEATS = [b'A', b'\x00', b'\xfe', b'\xff'] +ASCII_BYTE = r" !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t".encode( + "ascii" +) +ASCII_RE_4 = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 4)) +UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4)) +REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"] SLICE_SIZE = 4096 -String = namedtuple('String', ['s', 'offset']) +String = namedtuple("String", ["s", "offset"]) def buf_filled_with(buf, character): dupe_chunk = character * SLICE_SIZE for offset in range(0, len(buf), SLICE_SIZE): - new_chunk = buf[offset: offset + SLICE_SIZE] - if dupe_chunk[:len(new_chunk)] != new_chunk: + new_chunk = buf[offset : offset + SLICE_SIZE] + if dupe_chunk[: len(new_chunk)] != new_chunk: return False return True def extract_ascii_strings(buf, n=4): - ''' + """ Extract ASCII strings from the given binary data. :param buf: A bytestring. @@ -34,7 +36,7 @@ def extract_ascii_strings(buf, n=4): :param n: The minimum length of strings to extract. 
:type n: int :rtype: Sequence[String] - ''' + """ if not buf: return @@ -46,14 +48,14 @@ def extract_ascii_strings(buf, n=4): if n == 4: r = ASCII_RE_4 else: - reg = b'([%s]{%d,})' % (ASCII_BYTE, n) + reg = b"([%s]{%d,})" % (ASCII_BYTE, n) r = re.compile(reg) for match in r.finditer(buf): - yield String(match.group().decode('ascii'), match.start()) + yield String(match.group().decode("ascii"), match.start()) def extract_unicode_strings(buf, n=4): - ''' + """ Extract naive UTF-16 strings from the given binary data. :param buf: A bytestring. @@ -61,7 +63,7 @@ def extract_unicode_strings(buf, n=4): :param n: The minimum length of strings to extract. :type n: int :rtype: Sequence[String] - ''' + """ if not buf: return @@ -72,11 +74,11 @@ def extract_unicode_strings(buf, n=4): if n == 4: r = UNICODE_RE_4 else: - reg = b'((?:[%s]\x00){%d,})' % (ASCII_BYTE, n) + reg = b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, n) r = re.compile(reg) for match in r.finditer(buf): try: - yield String(match.group().decode('utf-16'), match.start()) + yield String(match.group().decode("utf-16"), match.start()) except UnicodeDecodeError: pass @@ -84,15 +86,15 @@ def extract_unicode_strings(buf, n=4): def main(): import sys - with open(sys.argv[1], 'rb') as f: + with open(sys.argv[1], "rb") as f: b = f.read() for s in extract_ascii_strings(b): - print('0x{:x}: {:s}'.format(s.offset, s.s)) + print("0x{:x}: {:s}".format(s.offset, s.s)) for s in extract_unicode_strings(b): - print('0x{:x}: {:s}'.format(s.offset, s.s)) + print("0x{:x}: {:s}".format(s.offset, s.s)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/capa/features/extractors/viv/__init__.py b/capa/features/extractors/viv/__init__.py index eb503ce0..c5b0366b 100644 --- a/capa/features/extractors/viv/__init__.py +++ b/capa/features/extractors/viv/__init__.py @@ -13,7 +13,8 @@ import file import function import basicblock import insn -__all__ = ['file', 'function', 'basicblock', 'insn'] + +__all__ = ["file", 
"function", "basicblock", "insn"] def get_va(self): @@ -27,14 +28,14 @@ def get_va(self): def add_va_int_cast(o): - ''' + """ dynamically add a cast-to-int (`__int__`) method to the given object that returns the value of the `.va` property. this bit of skullduggery lets use cast viv-utils objects as ints. the correct way of doing this is to update viv-utils (or subclass the objects here). - ''' - setattr(o, '__int__', types.MethodType(get_va, o, type(o))) + """ + setattr(o, "__int__", types.MethodType(get_va, o, type(o))) return o diff --git a/capa/features/extractors/viv/basicblock.py b/capa/features/extractors/viv/basicblock.py index a7a6ef5c..7847c6d4 100644 --- a/capa/features/extractors/viv/basicblock.py +++ b/capa/features/extractors/viv/basicblock.py @@ -10,7 +10,7 @@ from capa.features.extractors.helpers import MIN_STACKSTRING_LEN def interface_extract_basic_block_XXX(f, bb): - ''' + """ parse features from the given basic block. args: @@ -19,14 +19,14 @@ def interface_extract_basic_block_XXX(f, bb): yields: (Feature, int): the feature and the address at which its found. 
- ''' - yield NotImplementedError('feature'), NotImplementedError('virtual address') + """ + yield NotImplementedError("feature"), NotImplementedError("virtual address") def _bb_has_tight_loop(f, bb): - ''' + """ parse tight loops, true if last instruction in basic block branches to bb start - ''' + """ if len(bb.instructions) > 0: for bva, bflags in bb.instructions[-1].getBranches(): if bflags & vivisect.envi.BR_COND: @@ -37,16 +37,16 @@ def _bb_has_tight_loop(f, bb): def extract_bb_tight_loop(f, bb): - ''' check basic block for tight loop indicators ''' + """ check basic block for tight loop indicators """ if _bb_has_tight_loop(f, bb): - yield Characteristic('tight loop', True), bb.va + yield Characteristic("tight loop", True), bb.va def _bb_has_stackstring(f, bb): - ''' + """ extract potential stackstring creation, using the following heuristics: - basic block contains enough moves of constant bytes to the stack - ''' + """ count = 0 for instr in bb.instructions: if is_mov_imm_to_stack(instr): @@ -60,16 +60,16 @@ def _bb_has_stackstring(f, bb): def extract_stackstring(f, bb): - ''' check basic block for stackstring indicators ''' + """ check basic block for stackstring indicators """ if _bb_has_stackstring(f, bb): - yield Characteristic('stack string', True), bb.va + yield Characteristic("stack string", True), bb.va def is_mov_imm_to_stack(instr): - ''' + """ Return if instruction moves immediate onto stack - ''' - if not instr.mnem.startswith('mov'): + """ + if not instr.mnem.startswith("mov"): return False try: @@ -82,32 +82,33 @@ def is_mov_imm_to_stack(instr): return False # TODO what about 64-bit operands? 
- if not isinstance(dst, envi.archs.i386.disasm.i386SibOper) and \ - not isinstance(dst, envi.archs.i386.disasm.i386RegMemOper): + if not isinstance(dst, envi.archs.i386.disasm.i386SibOper) and not isinstance( + dst, envi.archs.i386.disasm.i386RegMemOper + ): return False if not dst.reg: return False rname = dst._dis_regctx.getRegisterName(dst.reg) - if rname not in ['ebp', 'rbp', 'esp', 'rsp']: + if rname not in ["ebp", "rbp", "esp", "rsp"]: return False return True def get_printable_len(oper): - ''' + """ Return string length if all operand bytes are ascii or utf16-le printable - ''' + """ if oper.tsize == 1: - chars = struct.pack(' 0: - return vw.readMemory(offset, alen).decode('utf-8') + return vw.readMemory(offset, alen).decode("utf-8") try: ulen = vw.detectUnicode(offset) @@ -199,13 +197,13 @@ def read_string(vw, offset): # vivisect seems to mis-detect the end unicode strings # off by one, too short ulen += 1 - return vw.readMemory(offset, ulen).decode('utf-16') + return vw.readMemory(offset, ulen).decode("utf-16") - raise ValueError('not a string', offset) + raise ValueError("not a string", offset) def extract_insn_string_features(f, bb, insn): - '''parse string features from the given instruction.''' + """parse string features from the given instruction.""" # example: # # push offset aAcr ; "ACR > " @@ -222,11 +220,11 @@ def extract_insn_string_features(f, bb, insn): except ValueError: continue else: - yield String(s.rstrip('\x00')), insn.va + yield String(s.rstrip("\x00")), insn.va def extract_insn_offset_features(f, bb, insn): - '''parse structure offset features from the given instruction.''' + """parse structure offset features from the given instruction.""" # example: # # .text:0040112F cmp [esi+4], ebx @@ -249,15 +247,18 @@ def extract_insn_offset_features(f, bb, insn): def is_security_cookie(f, bb, insn): - ''' + """ check if an instruction is related to security cookie checks - ''' + """ # security cookie check should use SP or BP oper = 
insn.opers[1] - if oper.isReg() \ - and oper.reg not in [envi.archs.i386.disasm.REG_ESP, envi.archs.i386.disasm.REG_EBP, - # TODO: do x64 support for real. - envi.archs.amd64.disasm.REG_RBP, envi.archs.amd64.disasm.REG_RSP]: + if oper.isReg() and oper.reg not in [ + envi.archs.i386.disasm.REG_ESP, + envi.archs.i386.disasm.REG_EBP, + # TODO: do x64 support for real. + envi.archs.amd64.disasm.REG_RBP, + envi.archs.amd64.disasm.REG_RSP, + ]: return False # expect security cookie init in first basic block within first bytes (instructions) @@ -273,11 +274,11 @@ def is_security_cookie(f, bb, insn): def extract_insn_nzxor_characteristic_features(f, bb, insn): - ''' + """ parse non-zeroing XOR instruction from the given instruction. ignore expected non-zeroing XORs, e.g. security cookies. - ''' - if insn.mnem != 'xor': + """ + if insn.mnem != "xor": return if insn.opers[0] == insn.opers[1]: @@ -286,24 +287,24 @@ def extract_insn_nzxor_characteristic_features(f, bb, insn): if is_security_cookie(f, bb, insn): return - yield Characteristic('nzxor', True), insn.va + yield Characteristic("nzxor", True), insn.va def extract_insn_mnemonic_features(f, bb, insn): - '''parse mnemonic features from the given instruction.''' + """parse mnemonic features from the given instruction.""" yield Mnemonic(insn.mnem), insn.va def extract_insn_peb_access_characteristic_features(f, bb, insn): - ''' + """ parse peb access from the given function. 
fs:[0x30] on x86, gs:[0x60] on x64 - ''' + """ # TODO handle where fs/gs are loaded into a register or onto the stack and used later - if insn.mnem not in ['push', 'mov']: + if insn.mnem not in ["push", "mov"]: return - if 'fs' in insn.getPrefixName(): + if "fs" in insn.getPrefixName(): for oper in insn.opers: # examples # @@ -312,27 +313,29 @@ def extract_insn_peb_access_characteristic_features(f, bb, insn): # IDA: push large dword ptr fs:30h # viv: fs: push dword [0x00000030] # fs: push dword [eax + 0x30] ; i386RegMemOper, with eax = 0 - if (isinstance(oper, envi.archs.i386.disasm.i386RegMemOper) and oper.disp == 0x30) or \ - (isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper) and oper.imm == 0x30): - yield Characteristic('peb access', True), insn.va - elif 'gs' in insn.getPrefixName(): + if (isinstance(oper, envi.archs.i386.disasm.i386RegMemOper) and oper.disp == 0x30) or ( + isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper) and oper.imm == 0x30 + ): + yield Characteristic("peb access", True), insn.va + elif "gs" in insn.getPrefixName(): for oper in insn.opers: - if (isinstance(oper, envi.archs.amd64.disasm.i386RegMemOper) and oper.disp == 0x60) or \ - (isinstance(oper, envi.archs.amd64.disasm.i386ImmMemOper) and oper.imm == 0x60): - yield Characteristic('peb access', True), insn.va + if (isinstance(oper, envi.archs.amd64.disasm.i386RegMemOper) and oper.disp == 0x60) or ( + isinstance(oper, envi.archs.amd64.disasm.i386ImmMemOper) and oper.imm == 0x60 + ): + yield Characteristic("peb access", True), insn.va else: pass def extract_insn_segment_access_features(f, bb, insn): - ''' parse the instruction for access to fs or gs ''' + """ parse the instruction for access to fs or gs """ prefix = insn.getPrefixName() - if prefix == 'fs': - yield Characteristic('fs access', True), insn.va + if prefix == "fs": + yield Characteristic("fs access", True), insn.va - if prefix == 'gs': - yield Characteristic('gs access', True), insn.va + if prefix == "gs": + yield 
Characteristic("gs access", True), insn.va def get_section(vw, va): @@ -344,16 +347,16 @@ def get_section(vw, va): def extract_insn_cross_section_cflow(f, bb, insn): - ''' + """ inspect the instruction for a CALL or JMP that crosses section boundaries. - ''' + """ for va, flags in insn.getBranches(): if flags & envi.BR_FALL: continue try: # skip 32-bit calls to imports - if insn.mnem == 'call' and isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper): + if insn.mnem == "call" and isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper): oper = insn.opers[0] target = oper.getOperAddr(insn) @@ -361,7 +364,7 @@ def extract_insn_cross_section_cflow(f, bb, insn): continue # skip 64-bit calls to imports - elif insn.mnem == 'call' and isinstance(insn.opers[0], envi.archs.amd64.disasm.Amd64RipRelOper): + elif insn.mnem == "call" and isinstance(insn.opers[0], envi.archs.amd64.disasm.Amd64RipRelOper): op = insn.opers[0] target = op.getOperAddr(insn) @@ -369,7 +372,7 @@ def extract_insn_cross_section_cflow(f, bb, insn): continue if get_section(f.vw, insn.va) != get_section(f.vw, va): - yield Characteristic('cross section flow', True), insn.va + yield Characteristic("cross section flow", True), insn.va except KeyError: continue @@ -378,7 +381,7 @@ def extract_insn_cross_section_cflow(f, bb, insn): # this is a feature that's most relevant at the function scope, # however, its most efficient to extract at the instruction scope. 
def extract_function_calls_from(f, bb, insn): - if insn.mnem != 'call': + if insn.mnem != "call": return target = None @@ -387,7 +390,7 @@ def extract_function_calls_from(f, bb, insn): if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper): oper = insn.opers[0] target = oper.getOperAddr(insn) - yield Characteristic('calls from', True), target + yield Characteristic("calls from", True), target # call via thunk on x86, # see 9324d1a8ae37a36ae560c37448c9705a at 0x407985 @@ -396,44 +399,44 @@ def extract_function_calls_from(f, bb, insn): # see Lab21-01.exe_:0x140001178 elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386PcRelOper): target = insn.opers[0].getOperValue(insn) - yield Characteristic('calls from', True), target + yield Characteristic("calls from", True), target # call via IAT, x64 elif isinstance(insn.opers[0], envi.archs.amd64.disasm.Amd64RipRelOper): op = insn.opers[0] target = op.getOperAddr(insn) - yield Characteristic('calls from', True), target + yield Characteristic("calls from", True), target if target and target == f.va: # if we found a jump target and it's the function address # mark as recursive - yield Characteristic('recursive call', True), target + yield Characteristic("recursive call", True), target # this is a feature that's most relevant at the function or basic block scope, # however, its most efficient to extract at the instruction scope. 
def extract_function_indirect_call_characteristic_features(f, bb, insn): - ''' + """ extract indirect function call characteristic (e.g., call eax or call dword ptr [edx+4]) does not include calls like => call ds:dword_ABD4974 - ''' - if insn.mnem != 'call': + """ + if insn.mnem != "call": return # Checks below work for x86 and x64 if isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper): # call edx - yield Characteristic('indirect call', True), insn.va + yield Characteristic("indirect call", True), insn.va elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegMemOper): # call dword ptr [eax+50h] - yield Characteristic('indirect call', True), insn.va + yield Characteristic("indirect call", True), insn.va elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386SibOper): # call qword ptr [rsp+78h] - yield Characteristic('indirect call', True), insn.va + yield Characteristic("indirect call", True), insn.va def extract_features(f, bb, insn): - ''' + """ extract features from the given insn. args: @@ -443,7 +446,7 @@ def extract_features(f, bb, insn): yields: Feature, set[VA]: the features and their location found in this insn. 
- ''' + """ for insn_handler in INSTRUCTION_HANDLERS: for feature, va in insn_handler(f, bb, insn): yield feature, va @@ -461,5 +464,5 @@ INSTRUCTION_HANDLERS = ( extract_insn_cross_section_cflow, extract_insn_segment_access_features, extract_function_calls_from, - extract_function_indirect_call_characteristic_features + extract_function_indirect_call_characteristic_features, ) diff --git a/capa/features/file.py b/capa/features/file.py index 708b8e2b..c5d16879 100644 --- a/capa/features/file.py +++ b/capa/features/file.py @@ -8,7 +8,7 @@ class Export(Feature): self.value = value def __str__(self): - return 'Export(%s)' % (self.value) + return "Export(%s)" % (self.value) class Import(Feature): @@ -18,7 +18,7 @@ class Import(Feature): self.value = value def __str__(self): - return 'Import(%s)' % (self.value) + return "Import(%s)" % (self.value) class Section(Feature): @@ -28,4 +28,4 @@ class Section(Feature): self.value = value def __str__(self): - return 'Section(%s)' % (self.value) + return "Section(%s)" % (self.value) diff --git a/capa/features/freeze.py b/capa/features/freeze.py index 0499cf7a..bfc20781 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -1,4 +1,4 @@ -''' +""" capa freeze file format: `| capa0000 | + zlib(utf-8(json(...)))` json format: @@ -39,7 +39,7 @@ json format: ], } } -''' +""" import json import zlib import logging @@ -61,10 +61,7 @@ def serialize_feature(feature): return feature.freeze_serialize() -KNOWN_FEATURES = { - F.__name__: F - for F in capa.features.Feature.__subclasses__() -} +KNOWN_FEATURES = {F.__name__: F for F in capa.features.Feature.__subclasses__()} def deserialize_feature(doc): @@ -73,7 +70,7 @@ def deserialize_feature(doc): def dumps(extractor): - ''' + """ serialize the given extractor to a string args: @@ -81,79 +78,64 @@ def dumps(extractor): returns: str: the serialized features. 
- ''' - ret = { - 'version': 1, - 'functions': {}, - 'scopes': { - 'file': [], - 'function': [], - 'basic block': [], - 'instruction': [], - } - } + """ + ret = {"version": 1, "functions": {}, "scopes": {"file": [], "function": [], "basic block": [], "instruction": [],}} for feature, va in extractor.extract_file_features(): - ret['scopes']['file'].append( - serialize_feature(feature) + (hex(va), ()) - ) + ret["scopes"]["file"].append(serialize_feature(feature) + (hex(va), ())) for f in extractor.get_functions(): - ret['functions'][hex(f)] = {} + ret["functions"][hex(f)] = {} for feature, va in extractor.extract_function_features(f): - ret['scopes']['function'].append( - serialize_feature(feature) + (hex(va), (hex(f), )) - ) + ret["scopes"]["function"].append(serialize_feature(feature) + (hex(va), (hex(f),))) for bb in extractor.get_basic_blocks(f): - ret['functions'][hex(f)][hex(bb)] = [] + ret["functions"][hex(f)][hex(bb)] = [] for feature, va in extractor.extract_basic_block_features(f, bb): - ret['scopes']['basic block'].append( - serialize_feature(feature) + (hex(va), (hex(f), hex(bb), )) - ) + ret["scopes"]["basic block"].append(serialize_feature(feature) + (hex(va), (hex(f), hex(bb),))) for insn, insnva in sorted([(insn, int(insn)) for insn in extractor.get_instructions(f, bb)]): - ret['functions'][hex(f)][hex(bb)].append(hex(insnva)) + ret["functions"][hex(f)][hex(bb)].append(hex(insnva)) for feature, va in extractor.extract_insn_features(f, bb, insn): - ret['scopes']['instruction'].append( - serialize_feature(feature) + (hex(va), (hex(f), hex(bb), hex(insnva), )) + ret["scopes"]["instruction"].append( + serialize_feature(feature) + (hex(va), (hex(f), hex(bb), hex(insnva),)) ) return json.dumps(ret) def loads(s): - '''deserialize a set of features (as a NullFeatureExtractor) from a string.''' + """deserialize a set of features (as a NullFeatureExtractor) from a string.""" doc = json.loads(s) - if doc.get('version') != 1: - raise ValueError('unsupported 
freeze format version: %d' % (doc.get('version'))) + if doc.get("version") != 1: + raise ValueError("unsupported freeze format version: %d" % (doc.get("version"))) features = { - 'file features': [], - 'functions': {}, + "file features": [], + "functions": {}, } - for fva, function in doc.get('functions', {}).items(): + for fva, function in doc.get("functions", {}).items(): fva = int(fva, 0x10) - features['functions'][fva] = { - 'features': [], - 'basic blocks': {}, + features["functions"][fva] = { + "features": [], + "basic blocks": {}, } for bbva, bb in function.items(): bbva = int(bbva, 0x10) - features['functions'][fva]['basic blocks'][bbva] = { - 'features': [], - 'instructions': {}, + features["functions"][fva]["basic blocks"][bbva] = { + "features": [], + "instructions": {}, } for insnva in bb: insnva = int(insnva, 0x10) - features['functions'][fva]['basic blocks'][bbva]['instructions'][insnva] = { - 'features': [], + features["functions"][fva]["basic blocks"][bbva]["instructions"][insnva] = { + "features": [], } # in the following blocks, each entry looks like: @@ -161,13 +143,13 @@ def loads(s): # ('MatchedRule', ('foo', ), '0x401000', ('0x401000', )) # ^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^ ^^^^^^^^^^^^^^ # feature name args addr func/bb/insn - for feature in doc.get('scopes', {}).get('file', []): + for feature in doc.get("scopes", {}).get("file", []): va, loc = feature[2:] va = int(va, 0x10) feature = deserialize_feature(feature[:2]) - features['file features'].append((va, feature)) + features["file features"].append((va, feature)) - for feature in doc.get('scopes', {}).get('function', []): + for feature in doc.get("scopes", {}).get("function", []): # fetch the pair like: # # ('0x401000', ('0x401000', )) @@ -183,42 +165,42 @@ def loads(s): # ^^^^^^^^^^^^^ ^^^^^^^^^ # feature name args feature = deserialize_feature(feature[:2]) - features['functions'][loc[0]]['features'].append((va, feature)) + features["functions"][loc[0]]["features"].append((va, feature)) - 
for feature in doc.get('scopes', {}).get('basic block', []): + for feature in doc.get("scopes", {}).get("basic block", []): va, loc = feature[2:] va = int(va, 0x10) loc = [int(lo, 0x10) for lo in loc] feature = deserialize_feature(feature[:2]) - features['functions'][loc[0]]['basic blocks'][loc[1]]['features'].append((va, feature)) + features["functions"][loc[0]]["basic blocks"][loc[1]]["features"].append((va, feature)) - for feature in doc.get('scopes', {}).get('instruction', []): + for feature in doc.get("scopes", {}).get("instruction", []): va, loc = feature[2:] va = int(va, 0x10) loc = [int(lo, 0x10) for lo in loc] feature = deserialize_feature(feature[:2]) - features['functions'][loc[0]]['basic blocks'][loc[1]]['instructions'][loc[2]]['features'].append((va, feature)) + features["functions"][loc[0]]["basic blocks"][loc[1]]["instructions"][loc[2]]["features"].append((va, feature)) return capa.features.extractors.NullFeatureExtractor(features) -MAGIC = 'capa0000'.encode('ascii') +MAGIC = "capa0000".encode("ascii") def dump(extractor): - '''serialize the given extractor to a byte array.''' - return MAGIC + zlib.compress(dumps(extractor).encode('utf-8')) + """serialize the given extractor to a byte array.""" + return MAGIC + zlib.compress(dumps(extractor).encode("utf-8")) def is_freeze(buf): - return buf[:len(MAGIC)] == MAGIC + return buf[: len(MAGIC)] == MAGIC def load(buf): - '''deserialize a set of features (as a NullFeatureExtractor) from a byte array.''' + """deserialize a set of features (as a NullFeatureExtractor) from a byte array.""" if not is_freeze(buf): - raise ValueError('missing magic header') - return loads(zlib.decompress(buf[len(MAGIC):]).decode('utf-8')) + raise ValueError("missing magic header") + return loads(zlib.decompress(buf[len(MAGIC) :]).decode("utf-8")) def main(argv=None): @@ -230,24 +212,21 @@ def main(argv=None): argv = sys.argv[1:] formats = [ - ('auto', '(default) detect file type automatically'), - ('pe', 'Windows PE file'), - 
('sc32', '32-bit shellcode'), - ('sc64', '64-bit shellcode'), + ("auto", "(default) detect file type automatically"), + ("pe", "Windows PE file"), + ("sc32", "32-bit shellcode"), + ("sc64", "64-bit shellcode"), ] - format_help = ', '.join(['%s: %s' % (f[0], f[1]) for f in formats]) + format_help = ", ".join(["%s: %s" % (f[0], f[1]) for f in formats]) - parser = argparse.ArgumentParser(description='save capa features to a file') - parser.add_argument('sample', type=str, - help='Path to sample to analyze') - parser.add_argument('output', type=str, - help='Path to output file') - parser.add_argument('-v', '--verbose', action='store_true', - help='Enable verbose output') - parser.add_argument('-q', '--quiet', action='store_true', - help='Disable all output but errors') - parser.add_argument('-f', '--format', choices=[f[0] for f in formats], default='auto', - help='Select sample format, %s' % format_help) + parser = argparse.ArgumentParser(description="save capa features to a file") + parser.add_argument("sample", type=str, help="Path to sample to analyze") + parser.add_argument("output", type=str, help="Path to output file") + parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output") + parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors") + parser.add_argument( + "-f", "--format", choices=[f[0] for f in formats], default="auto", help="Select sample format, %s" % format_help + ) args = parser.parse_args(args=argv) if args.quiet: @@ -264,13 +243,15 @@ def main(argv=None): # don't import this at top level to support ida/py3 backend import capa.features.extractors.viv + extractor = capa.features.extractors.viv.VivisectFeatureExtractor(vw, args.sample) - with open(args.output, 'wb') as f: + with open(args.output, "wb") as f: f.write(dump(extractor)) return 0 -if __name__ == '__main__': +if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/capa/features/insn.py b/capa/features/insn.py 
index b8ebf9da..63abae98 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -4,9 +4,9 @@ from capa.features import Feature class API(Feature): def __init__(self, name): # Downcase library name if given - if '.' in name: - modname, impname = name.split('.') - name = modname.lower() + '.' + impname + if "." in name: + modname, impname = name.split(".") + name = modname.lower() + "." + impname super(API, self).__init__([name]) @@ -19,9 +19,9 @@ class Number(Feature): def __str__(self): if self.symbol: - return 'number(0x%x = %s)' % (self.value, self.symbol) + return "number(0x%x = %s)" % (self.value, self.symbol) else: - return 'number(0x%x)' % (self.value) + return "number(0x%x)" % (self.value) class Offset(Feature): @@ -32,9 +32,9 @@ class Offset(Feature): def __str__(self): if self.symbol: - return 'offset(0x%x = %s)' % (self.value, self.symbol) + return "offset(0x%x = %s)" % (self.value, self.symbol) else: - return 'offset(0x%x)' % (self.value) + return "offset(0x%x)" % (self.value) class Mnemonic(Feature): @@ -43,4 +43,4 @@ class Mnemonic(Feature): self.value = value def __str__(self): - return 'mnemonic(%s)' % (self.value) + return "mnemonic(%s)" % (self.value) diff --git a/capa/helpers.py b/capa/helpers.py index 20329b48..81007abb 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -4,7 +4,7 @@ _hex = hex def hex(i): # under py2.7, long integers get formatted with a trailing `L` # and this is not pretty. so strip it out. - return _hex(oint(i)).rstrip('L') + return _hex(oint(i)).rstrip("L") def oint(i): diff --git a/capa/ida/explorer/item.py b/capa/ida/explorer/item.py index 729faa4a..35d8b54c 100644 --- a/capa/ida/explorer/item.py +++ b/capa/ida/explorer/item.py @@ -15,14 +15,14 @@ def info_to_name(display): e.g. 
function(my_function) => my_function """ try: - return display.split('(')[1].rstrip(')') + return display.split("(")[1].rstrip(")") except IndexError: - return '' + return "" def location_to_hex(location): """ convert location to hex for display """ - return '%08X' % location + return "%08X" % location class CapaExplorerDataItem(object): @@ -35,7 +35,12 @@ class CapaExplorerDataItem(object): self.children = [] self._checked = False - self.flags = (QtCore.Qt.ItemIsEnabled | QtCore.Qt.ItemIsSelectable | QtCore.Qt.ItemIsTristate | QtCore.Qt.ItemIsUserCheckable) + self.flags = ( + QtCore.Qt.ItemIsEnabled + | QtCore.Qt.ItemIsSelectable + | QtCore.Qt.ItemIsTristate + | QtCore.Qt.ItemIsUserCheckable + ) if self.pred: self.pred.appendChild(self) @@ -109,7 +114,7 @@ class CapaExplorerDataItem(object): def __str__(self): """ get string representation of columns """ - return ' '.join([data for data in self._data if data]) + return " ".join([data for data in self._data if data]) @property def info(self): @@ -133,12 +138,12 @@ class CapaExplorerDataItem(object): class CapaExplorerRuleItem(CapaExplorerDataItem): """ store data relevant to capa function result """ - fmt = '%s (%d matches)' + fmt = "%s (%d matches)" def __init__(self, parent, display, count, source): """ """ display = self.fmt % (display, count) if count > 1 else display - super(CapaExplorerRuleItem, self).__init__(parent, [display, '', '']) + super(CapaExplorerRuleItem, self).__init__(parent, [display, "", ""]) self._source = source @property @@ -150,9 +155,9 @@ class CapaExplorerRuleItem(CapaExplorerDataItem): class CapaExplorerRuleMatchItem(CapaExplorerDataItem): """ store data relevant to capa function match result """ - def __init__(self, parent, display, source=''): + def __init__(self, parent, display, source=""): """ """ - super(CapaExplorerRuleMatchItem, self).__init__(parent, [display, '', '']) + super(CapaExplorerRuleMatchItem, self).__init__(parent, [display, "", ""]) self._source = source @property @@ 
-164,12 +169,13 @@ class CapaExplorerRuleMatchItem(CapaExplorerDataItem): class CapaExplorerFunctionItem(CapaExplorerDataItem): """ store data relevant to capa function result """ - fmt = 'function(%s)' + fmt = "function(%s)" def __init__(self, parent, location): """ """ - super(CapaExplorerFunctionItem, self).__init__(parent, [self.fmt % idaapi.get_name(location), - location_to_hex(location), '']) + super(CapaExplorerFunctionItem, self).__init__( + parent, [self.fmt % idaapi.get_name(location), location_to_hex(location), ""] + ) @property def info(self): @@ -187,32 +193,31 @@ class CapaExplorerFunctionItem(CapaExplorerDataItem): class CapaExplorerBlockItem(CapaExplorerDataItem): """ store data relevant to capa basic block result """ - fmt = 'basic block(loc_%08X)' + fmt = "basic block(loc_%08X)" def __init__(self, parent, location): """ """ - super(CapaExplorerBlockItem, self).__init__(parent, [self.fmt % location, location_to_hex(location), '']) + super(CapaExplorerBlockItem, self).__init__(parent, [self.fmt % location, location_to_hex(location), ""]) class CapaExplorerDefaultItem(CapaExplorerDataItem): """ store data relevant to capa default result """ - def __init__(self, parent, display, details='', location=None): + def __init__(self, parent, display, details="", location=None): """ """ - location = location_to_hex(location) if location else '' + location = location_to_hex(location) if location else "" super(CapaExplorerDefaultItem, self).__init__(parent, [display, location, details]) class CapaExplorerFeatureItem(CapaExplorerDataItem): """ store data relevant to capa feature result """ - def __init__(self, parent, display, location='', details=''): - location = location_to_hex(location) if location else '' + def __init__(self, parent, display, location="", details=""): + location = location_to_hex(location) if location else "" super(CapaExplorerFeatureItem, self).__init__(parent, [display, location, details]) class 
CapaExplorerInstructionViewItem(CapaExplorerFeatureItem): - def __init__(self, parent, display, location): """ """ details = capa.ida.helpers.get_disasm_line(location) @@ -221,26 +226,24 @@ class CapaExplorerInstructionViewItem(CapaExplorerFeatureItem): class CapaExplorerByteViewItem(CapaExplorerFeatureItem): - def __init__(self, parent, display, location): """ """ byte_snap = idaapi.get_bytes(location, 32) if byte_snap: - byte_snap = codecs.encode(byte_snap, 'hex').upper() + byte_snap = codecs.encode(byte_snap, "hex").upper() if sys.version_info >= (3, 0): - details = ' '.join([byte_snap[i:i + 2].decode() for i in range(0, len(byte_snap), 2)]) + details = " ".join([byte_snap[i : i + 2].decode() for i in range(0, len(byte_snap), 2)]) else: - details = ' '.join([byte_snap[i:i + 2] for i in range(0, len(byte_snap), 2)]) + details = " ".join([byte_snap[i : i + 2] for i in range(0, len(byte_snap), 2)]) else: - details = '' + details = "" super(CapaExplorerByteViewItem, self).__init__(parent, display, location=location, details=details) self.ida_highlight = idc.get_color(location, idc.CIC_ITEM) class CapaExplorerStringViewItem(CapaExplorerFeatureItem): - def __init__(self, parent, display, location): """ """ super(CapaExplorerStringViewItem, self).__init__(parent, display, location=location) diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py index 7b3b21c7..ce55a039 100644 --- a/capa/ida/explorer/model.py +++ b/capa/ida/explorer/model.py @@ -16,7 +16,7 @@ from capa.ida.explorer.item import ( CapaExplorerByteViewItem, CapaExplorerBlockItem, CapaExplorerRuleMatchItem, - CapaExplorerFeatureItem + CapaExplorerFeatureItem, ) import capa.ida.helpers @@ -37,7 +37,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): def __init__(self, parent=None): """ """ super(CapaExplorerDataModel, self).__init__(parent) - self.root_node = CapaExplorerDataItem(None, ['Rule Information', 'Address', 'Details']) + self.root_node = CapaExplorerDataItem(None, ["Rule 
Information", "Address", "Details"]) def reset(self): """ """ @@ -86,8 +86,11 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): # display data in corresponding column return item.data(column) - if role == QtCore.Qt.ToolTipRole and isinstance(item, (CapaExplorerRuleItem, CapaExplorerRuleMatchItem)) and \ - CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION == column: + if ( + role == QtCore.Qt.ToolTipRole + and isinstance(item, (CapaExplorerRuleItem, CapaExplorerRuleMatchItem)) + and CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION == column + ): # show tooltip containing rule source return item.source @@ -95,18 +98,30 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): # inform view how to display content of checkbox - un/checked return QtCore.Qt.Checked if item.isChecked() else QtCore.Qt.Unchecked - if role == QtCore.Qt.FontRole and column in (CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS, - CapaExplorerDataModel.COLUMN_INDEX_DETAILS): + if role == QtCore.Qt.FontRole and column in ( + CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS, + CapaExplorerDataModel.COLUMN_INDEX_DETAILS, + ): # set font for virtual address and details columns - font = QtGui.QFont('Courier', weight=QtGui.QFont.Medium) + font = QtGui.QFont("Courier", weight=QtGui.QFont.Medium) if column == CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS: font.setBold(True) return font - if role == QtCore.Qt.FontRole and isinstance(item, (CapaExplorerRuleItem, CapaExplorerRuleMatchItem, - CapaExplorerBlockItem, CapaExplorerFunctionItem, - CapaExplorerFeatureItem)) and \ - column == CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION: + if ( + role == QtCore.Qt.FontRole + and isinstance( + item, + ( + CapaExplorerRuleItem, + CapaExplorerRuleMatchItem, + CapaExplorerBlockItem, + CapaExplorerFunctionItem, + CapaExplorerFeatureItem, + ), + ) + and column == CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION + ): # set bold font for top-level rules font = QtGui.QFont() 
font.setBold(True) @@ -116,8 +131,11 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): # set color for virtual address column return QtGui.QColor(88, 139, 174) - if role == QtCore.Qt.ForegroundRole and isinstance(item, CapaExplorerFeatureItem) and column == \ - CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION: + if ( + role == QtCore.Qt.ForegroundRole + and isinstance(item, CapaExplorerFeatureItem) + and column == CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION + ): # set color for feature items return QtGui.QColor(79, 121, 66) @@ -222,8 +240,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): def util_reset_ida_highlighting(self, item, checked): """ """ - if not isinstance(item, (CapaExplorerStringViewItem, CapaExplorerInstructionViewItem, - CapaExplorerByteViewItem)): + if not isinstance( + item, (CapaExplorerStringViewItem, CapaExplorerInstructionViewItem, CapaExplorerByteViewItem) + ): # ignore other item types return @@ -254,8 +273,10 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): if not model_index.isValid(): return False - if role == QtCore.Qt.CheckStateRole and model_index.column() ==\ - CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION: + if ( + role == QtCore.Qt.CheckStateRole + and model_index.column() == CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION + ): # user un/checked box - un/check parent and children for child_index in self.iterateChildrenIndexFromRootIndex(model_index, ignore_root=False): child_index.internalPointer().setChecked(value) @@ -263,9 +284,12 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): self.dataChanged.emit(child_index, child_index) return True - if role == QtCore.Qt.EditRole and value and \ - model_index.column() == CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION and \ - isinstance(model_index.internalPointer(), CapaExplorerFunctionItem): + if ( + role == QtCore.Qt.EditRole + and value + and model_index.column() == CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION + and 
isinstance(model_index.internalPointer(), CapaExplorerFunctionItem) + ): # user renamed function - update IDA database and data model old_name = model_index.internalPointer().info new_name = str(value) @@ -309,39 +333,39 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): "type": "or" }, """ - if statement['type'] in ('and', 'or', 'optional'): - return CapaExplorerDefaultItem(parent, statement['type']) - elif statement['type'] == 'not': + if statement["type"] in ("and", "or", "optional"): + return CapaExplorerDefaultItem(parent, statement["type"]) + elif statement["type"] == "not": # TODO: do we display 'not' pass - elif statement['type'] == 'some': - return CapaExplorerDefaultItem(parent, statement['count'] + ' or more') - elif statement['type'] == 'range': + elif statement["type"] == "some": + return CapaExplorerDefaultItem(parent, statement["count"] + " or more") + elif statement["type"] == "range": # `range` is a weird node, its almost a hybrid of statement + feature. # it is a specific feature repeated multiple times. # there's no additional logic in the feature part, just the existence of a feature. # so, we have to inline some of the feature rendering here. 
- display = 'count(%s): ' % self.capa_doc_feature_to_display(statement['child']) + display = "count(%s): " % self.capa_doc_feature_to_display(statement["child"]) - if statement['max'] == statement['min']: - display += '%d' % (statement['min']) - elif statement['min'] == 0: - display += '%d or fewer' % (statement['max']) - elif statement['max'] == (1 << 64 - 1): - display += '%d or more' % (statement['min']) + if statement["max"] == statement["min"]: + display += "%d" % (statement["min"]) + elif statement["min"] == 0: + display += "%d or fewer" % (statement["max"]) + elif statement["max"] == (1 << 64 - 1): + display += "%d or more" % (statement["min"]) else: - display += 'between %d and %d' % (statement['min'], statement['max']) + display += "between %d and %d" % (statement["min"], statement["max"]) return CapaExplorerFeatureItem(parent, display=display) - elif statement['type'] == 'subscope': - return CapaExplorerFeatureItem(parent, 'subscope(%s)' % statement['subscope']) - elif statement['type'] == 'regex': + elif statement["type"] == "subscope": + return CapaExplorerFeatureItem(parent, "subscope(%s)" % statement["subscope"]) + elif statement["type"] == "regex": # regex is a `Statement` not a `Feature` # this is because it doesn't get extracted, but applies to all strings in scope. # so we have to handle it here - return CapaExplorerFeatureItem(parent, 'regex(%s)' % statement['pattern'], details=statement['match']) + return CapaExplorerFeatureItem(parent, "regex(%s)" % statement["pattern"], details=statement["match"]) else: - raise RuntimeError('unexpected match statement type: ' + str(statement)) + raise RuntimeError("unexpected match statement type: " + str(statement)) def render_capa_doc_match(self, parent, match, doc): """ render capa match read from doc @@ -367,23 +391,24 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): } }, """ - if not match['success']: + if not match["success"]: # TODO: display failed branches at some point? 
Help with debugging rules? return # optional statement with no successful children is empty - if (match['node'].get('statement', {}).get('type') == 'optional' - and not any(map(lambda m: m['success'], match['children']))): + if match["node"].get("statement", {}).get("type") == "optional" and not any( + map(lambda m: m["success"], match["children"]) + ): return - if match['node']['type'] == 'statement': - parent2 = self.render_capa_doc_statement_node(parent, match['node']['statement'], doc) - elif match['node']['type'] == 'feature': - parent2 = self.render_capa_doc_feature_node(parent, match['node']['feature'], match['locations'], doc) + if match["node"]["type"] == "statement": + parent2 = self.render_capa_doc_statement_node(parent, match["node"]["statement"], doc) + elif match["node"]["type"] == "feature": + parent2 = self.render_capa_doc_feature_node(parent, match["node"]["feature"], match["locations"], doc) else: - raise RuntimeError('unexpected node type: ' + str(match['node']['type'])) + raise RuntimeError("unexpected node type: " + str(match["node"]["type"])) - for child in match['children']: + for child in match["children"]: self.render_capa_doc_match(parent2, child, doc) def render_capa_doc(self, doc): @@ -394,17 +419,17 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): self.beginResetModel() for rule in rutils.capability_rules(doc): - parent = CapaExplorerRuleItem(self.root_node, rule['meta']['name'], len(rule['matches']), rule['source']) + parent = CapaExplorerRuleItem(self.root_node, rule["meta"]["name"], len(rule["matches"]), rule["source"]) - for (location, match) in doc[rule['meta']['name']]['matches'].items(): - if rule['meta']['scope'] == capa.rules.FILE_SCOPE: + for (location, match) in doc[rule["meta"]["name"]]["matches"].items(): + if rule["meta"]["scope"] == capa.rules.FILE_SCOPE: parent2 = parent - elif rule['meta']['scope'] == capa.rules.FUNCTION_SCOPE: + elif rule["meta"]["scope"] == capa.rules.FUNCTION_SCOPE: parent2 = 
CapaExplorerFunctionItem(parent, location) - elif rule['meta']['scope'] == capa.rules.BASIC_BLOCK_SCOPE: + elif rule["meta"]["scope"] == capa.rules.BASIC_BLOCK_SCOPE: parent2 = CapaExplorerBlockItem(parent, location) else: - raise RuntimeError('unexpected rule scope: ' + str(rule['meta']['scope'])) + raise RuntimeError("unexpected rule scope: " + str(rule["meta"]["scope"])) self.render_capa_doc_match(parent2, match, doc) @@ -421,20 +446,20 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): }, """ mapping = { - 'string': 'string(%s)', - 'bytes': 'bytes(%s)', - 'api': 'api(%s)', - 'mnemonic': 'mnemonic(%s)', - 'export': 'export(%s)', - 'import': 'import(%s)', - 'section': 'section(%s)', - 'number': 'number(0x%X)', - 'offset': 'offset(0x%X)', - 'characteristic': 'characteristic(%s)', - 'match': 'rule match(%s)' + "string": "string(%s)", + "bytes": "bytes(%s)", + "api": "api(%s)", + "mnemonic": "mnemonic(%s)", + "export": "export(%s)", + "import": "import(%s)", + "section": "section(%s)", + "number": "number(0x%X)", + "offset": "offset(0x%X)", + "characteristic": "characteristic(%s)", + "match": "rule match(%s)", } - ''' + """ "feature": { "characteristic": [ "loop", @@ -442,21 +467,23 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): ], "type": "characteristic" }, - ''' - if feature['type'] == 'characteristic': - return mapping['characteristic'] % feature['characteristic'][0] + """ + if feature["type"] == "characteristic": + return mapping["characteristic"] % feature["characteristic"][0] # convert bytes feature from "410ab4" to "41 0A B4" - if feature['type'] == 'bytes': - return mapping['bytes'] % ' '.join(feature['bytes'][i:i + 2] for i in - range(0, len(feature['bytes']), 2)).upper() + if feature["type"] == "bytes": + return ( + mapping["bytes"] + % " ".join(feature["bytes"][i : i + 2] for i in range(0, len(feature["bytes"]), 2)).upper() + ) try: - fmt = mapping[feature['type']] + fmt = mapping[feature["type"]] except KeyError: - raise 
RuntimeError('unexpected doc type: ' + str(feature['type'])) + raise RuntimeError("unexpected doc type: " + str(feature["type"])) - return fmt % feature[feature['type']] + return fmt % feature[feature["type"]] def render_capa_doc_feature_node(self, parent, feature, locations, doc): """ """ @@ -473,7 +500,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): return parent2 - def render_capa_doc_feature(self, parent, feature, location, doc, display='-'): + def render_capa_doc_feature(self, parent, feature, location, doc, display="-"): """ render capa feature read from doc @param parent: parent node to which new child is assigned @@ -491,51 +518,38 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): @param location: address of feature @param display: text to display in plugin ui """ - instruction_view = ( - 'bytes', - 'api', - 'mnemonic', - 'number', - 'offset' - ) - byte_view = ( - 'section', - ) - string_view = ( - 'string', - ) - default_feature_view = ( - 'import', - 'export' - ) + instruction_view = ("bytes", "api", "mnemonic", "number", "offset") + byte_view = ("section",) + string_view = ("string",) + default_feature_view = ("import", "export") # special handling for characteristic pending type - if feature['type'] == 'characteristic': - if feature['characteristic'][0] in ('embedded pe',): + if feature["type"] == "characteristic": + if feature["characteristic"][0] in ("embedded pe",): return CapaExplorerByteViewItem(parent, display, location) - if feature['characteristic'][0] in ('loop', 'recursive call', 'tight loop', 'switch'): + if feature["characteristic"][0] in ("loop", "recursive call", "tight loop", "switch"): return CapaExplorerFeatureItem(parent, display=display) # default to instruction view return CapaExplorerInstructionViewItem(parent, display, location) - if feature['type'] == 'match': - return CapaExplorerRuleMatchItem(parent, display, source=doc.get(feature['match'], {}).get('source', '')) + if feature["type"] == "match": + return 
CapaExplorerRuleMatchItem(parent, display, source=doc.get(feature["match"], {}).get("source", "")) - if feature['type'] in instruction_view: + if feature["type"] in instruction_view: return CapaExplorerInstructionViewItem(parent, display, location) - if feature['type'] in byte_view: + if feature["type"] in byte_view: return CapaExplorerByteViewItem(parent, display, location) - if feature['type'] in string_view: + if feature["type"] in string_view: return CapaExplorerStringViewItem(parent, display, location) - if feature['type'] in default_feature_view: + if feature["type"] in default_feature_view: return CapaExplorerFeatureItem(parent, display=display) - raise RuntimeError('unexpected feature type: ' + str(feature['type'])) + raise RuntimeError("unexpected feature type: " + str(feature["type"])) def update_function_name(self, old_name, new_name): """ update all instances of function name @@ -548,8 +562,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): # convert name to view format for matching old_name = CapaExplorerFunctionItem.fmt % old_name - for model_index in self.match(root_index, QtCore.Qt.DisplayRole, old_name, hits=-1, - flags=QtCore.Qt.MatchRecursive): + for model_index in self.match( + root_index, QtCore.Qt.DisplayRole, old_name, hits=-1, flags=QtCore.Qt.MatchRecursive + ): if not isinstance(model_index.internalPointer(), CapaExplorerFunctionItem): continue diff --git a/capa/ida/explorer/proxy.py b/capa/ida/explorer/proxy.py index 811eff16..9ebc4eb2 100644 --- a/capa/ida/explorer/proxy.py +++ b/capa/ida/explorer/proxy.py @@ -4,7 +4,6 @@ from capa.ida.explorer.model import CapaExplorerDataModel class CapaExplorerSortFilterProxyModel(QtCore.QSortFilterProxyModel): - def __init__(self, parent=None): """ """ super(CapaExplorerSortFilterProxyModel, self).__init__(parent) @@ -20,8 +19,12 @@ class CapaExplorerSortFilterProxyModel(QtCore.QSortFilterProxyModel): ldata = left.internalPointer().data(left.column()) rdata = 
right.internalPointer().data(right.column()) - if ldata and rdata and left.column() == CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS and left.column() \ - == right.column(): + if ( + ldata + and rdata + and left.column() == CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS + and left.column() == right.column() + ): # convert virtual address before compare return int(ldata, 16) < int(rdata, 16) else: diff --git a/capa/ida/explorer/view.py b/capa/ida/explorer/view.py index d2a8eb46..b43311da 100644 --- a/capa/ida/explorer/view.py +++ b/capa/ida/explorer/view.py @@ -55,7 +55,7 @@ class CapaExplorerQtreeView(QtWidgets.QTreeView): self.doubleClicked.connect(self.slot_double_click) # self.clicked.connect(self.slot_click) - self.setStyleSheet('QTreeView::item {padding-right: 15 px;padding-bottom: 2 px;}') + self.setStyleSheet("QTreeView::item {padding-right: 15 px;padding-bottom: 2 px;}") def reset(self): """ reset user interface changes @@ -114,8 +114,8 @@ class CapaExplorerQtreeView(QtWidgets.QTreeView): @yield QAction* """ default_actions = [ - ('Copy column', data, self.slot_copy_column), - ('Copy row', data, self.slot_copy_row), + ("Copy column", data, self.slot_copy_column), + ("Copy row", data, self.slot_copy_row), ] # add default actions @@ -130,7 +130,7 @@ class CapaExplorerQtreeView(QtWidgets.QTreeView): @yield QAction* """ function_actions = [ - ('Rename function', data, self.slot_rename_function), + ("Rename function", data, self.slot_rename_function), ] # add function actions diff --git a/capa/ida/helpers/__init__.py b/capa/ida/helpers/__init__.py index be6402e3..7d29e2ec 100644 --- a/capa/ida/helpers/__init__.py +++ b/capa/ida/helpers/__init__.py @@ -3,47 +3,48 @@ import logging import idaapi import idc -logger = logging.getLogger('capa') +logger = logging.getLogger("capa") # file type names as returned by idaapi.get_file_type_name() SUPPORTED_FILE_TYPES = [ - 'Portable executable for 80386 (PE)', - 'Portable executable for AMD64 (PE)', - 'Binary file' 
# x86/AMD64 shellcode support + "Portable executable for 80386 (PE)", + "Portable executable for AMD64 (PE)", + "Binary file", # x86/AMD64 shellcode support ] def inform_user_ida_ui(message): - idaapi.info('%s. Please refer to IDA Output window for more information.' % message) + idaapi.info("%s. Please refer to IDA Output window for more information." % message) def is_supported_file_type(): file_type = idaapi.get_file_type_name() if file_type not in SUPPORTED_FILE_TYPES: - logger.error('-' * 80) - logger.error(' Input file does not appear to be a PE file.') - logger.error(' ') + logger.error("-" * 80) + logger.error(" Input file does not appear to be a PE file.") + logger.error(" ") logger.error( - ' capa currently only supports analyzing PE files (or binary files containing x86/AMD64 shellcode) with IDA.') - logger.error(' If you don\'t know the input file type, you can try using the `file` utility to guess it.') - logger.error('-' * 80) - inform_user_ida_ui('capa does not support the format of this file') + " capa currently only supports analyzing PE files (or binary files containing x86/AMD64 shellcode) with IDA." 
+ ) + logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") + logger.error("-" * 80) + inform_user_ida_ui("capa does not support the format of this file") return False return True def get_disasm_line(va): - ''' ''' + """ """ return idc.generate_disasm_line(va, idc.GENDSM_FORCE_CODE) def is_func_start(ea): - ''' check if function stat exists at virtual address ''' + """ check if function stat exists at virtual address """ f = idaapi.get_func(ea) return f and f.start_ea == ea def get_func_start_ea(ea): - ''' ''' + """ """ f = idaapi.get_func(ea) return f if f is None else f.start_ea diff --git a/capa/ida/ida_capa_explorer.py b/capa/ida/ida_capa_explorer.py index 873e0513..1c6f263e 100644 --- a/capa/ida/ida_capa_explorer.py +++ b/capa/ida/ida_capa_explorer.py @@ -2,11 +2,7 @@ import os import logging import collections -from PyQt5 import ( - QtWidgets, - QtGui, - QtCore -) +from PyQt5 import QtWidgets, QtGui, QtCore import idaapi @@ -20,13 +16,12 @@ from capa.ida.explorer.view import CapaExplorerQtreeView from capa.ida.explorer.model import CapaExplorerDataModel from capa.ida.explorer.proxy import CapaExplorerSortFilterProxyModel -PLUGIN_NAME = 'capa explorer' +PLUGIN_NAME = "capa explorer" -logger = logging.getLogger('capa') +logger = logging.getLogger("capa") class CapaExplorerIdaHooks(idaapi.UI_Hooks): - def __init__(self, screen_ea_changed_hook, action_hooks): """ facilitate IDA UI hooks @@ -78,7 +73,6 @@ class CapaExplorerIdaHooks(idaapi.UI_Hooks): class CapaExplorerForm(idaapi.PluginForm): - def __init__(self): """ """ super(CapaExplorerForm, self).__init__() @@ -109,20 +103,20 @@ class CapaExplorerForm(idaapi.PluginForm): self.view_tree.reset() - logger.info('form created.') + logger.info("form created.") def Show(self): """ """ - return idaapi.PluginForm.Show(self, self.form_title, options=( - idaapi.PluginForm.WOPN_TAB | idaapi.PluginForm.WCLS_CLOSE_LATER - )) + return idaapi.PluginForm.Show( + self, 
self.form_title, options=(idaapi.PluginForm.WOPN_TAB | idaapi.PluginForm.WCLS_CLOSE_LATER) + ) def OnClose(self, form): """ form is closed """ self.unload_ida_hooks() self.ida_reset() - logger.info('form closed.') + logger.info("form closed.") def load_interface(self): """ load user interface """ @@ -165,8 +159,8 @@ class CapaExplorerForm(idaapi.PluginForm): def load_view_summary(self): """ """ table_headers = [ - 'Capability', - 'Namespace', + "Capability", + "Namespace", ] table = QtWidgets.QTableWidget() @@ -180,15 +174,15 @@ class CapaExplorerForm(idaapi.PluginForm): table.setHorizontalHeaderLabels(table_headers) table.horizontalHeader().setDefaultAlignment(QtCore.Qt.AlignLeft) table.setShowGrid(False) - table.setStyleSheet('QTableWidget::item { padding: 25px; }') + table.setStyleSheet("QTableWidget::item { padding: 25px; }") self.view_summary = table def load_view_attack(self): """ """ table_headers = [ - 'ATT&CK Tactic', - 'ATT&CK Technique ', + "ATT&CK Tactic", + "ATT&CK Technique ", ] table = QtWidgets.QTableWidget() @@ -202,13 +196,13 @@ class CapaExplorerForm(idaapi.PluginForm): table.setHorizontalHeaderLabels(table_headers) table.horizontalHeader().setDefaultAlignment(QtCore.Qt.AlignLeft) table.setShowGrid(False) - table.setStyleSheet('QTableWidget::item { padding: 25px; }') + table.setStyleSheet("QTableWidget::item { padding: 25px; }") self.view_attack = table def load_view_checkbox_limit_by(self): """ """ - check = QtWidgets.QCheckBox('Limit results to current function') + check = QtWidgets.QCheckBox("Limit results to current function") check.setChecked(False) check.stateChanged.connect(self.slot_checkbox_limit_by_changed) @@ -231,7 +225,7 @@ class CapaExplorerForm(idaapi.PluginForm): tab = QtWidgets.QWidget() tab.setLayout(layout) - self.view_tabs.addTab(tab, 'Tree View') + self.view_tabs.addTab(tab, "Tree View") def load_view_summary_tab(self): """ """ @@ -241,7 +235,7 @@ class CapaExplorerForm(idaapi.PluginForm): tab = QtWidgets.QWidget() 
tab.setLayout(layout) - self.view_tabs.addTab(tab, 'Summary') + self.view_tabs.addTab(tab, "Summary") def load_view_attack_tab(self): """ """ @@ -251,16 +245,16 @@ class CapaExplorerForm(idaapi.PluginForm): tab = QtWidgets.QWidget() tab.setLayout(layout) - self.view_tabs.addTab(tab, 'MITRE') + self.view_tabs.addTab(tab, "MITRE") def load_file_menu(self): """ load file menu actions """ actions = ( - ('Reset view', 'Reset plugin view', self.reset), - ('Run analysis', 'Run capa analysis on current database', self.reload), + ("Reset view", "Reset plugin view", self.reset), + ("Run analysis", "Run capa analysis on current database", self.reload), ) - menu = self.view_menu_bar.addMenu('File') + menu = self.view_menu_bar.addMenu("File") for name, _, handle in actions: action = QtWidgets.QAction(name, self.parent) @@ -271,8 +265,8 @@ class CapaExplorerForm(idaapi.PluginForm): def load_ida_hooks(self): """ """ action_hooks = { - 'MakeName': self.ida_hook_rename, - 'EditFunction': self.ida_hook_rename, + "MakeName": self.ida_hook_rename, + "EditFunction": self.ida_hook_rename, } self.ida_hooks = CapaExplorerIdaHooks(self.ida_hook_screen_ea_changed, action_hooks) @@ -300,10 +294,10 @@ class CapaExplorerForm(idaapi.PluginForm): if post: # post action update data model w/ current name - self.model_data.update_function_name(meta.get('prev_name', ''), curr_name) + self.model_data.update_function_name(meta.get("prev_name", ""), curr_name) else: # pre action so save current name for replacement later - meta['prev_name'] = curr_name + meta["prev_name"] = curr_name def ida_hook_screen_ea_changed(self, widget, new_ea, old_ea): """ """ @@ -328,21 +322,21 @@ class CapaExplorerForm(idaapi.PluginForm): match = capa.ida.explorer.item.ea_to_hex_str(new_func_start) else: # navigated to virtual address not in valid function - clear filter - match = '' + match = "" # filter on virtual address to avoid updating filter string if function name is changed 
self.model_proxy.add_single_string_filter(CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS, match) def load_capa_results(self): """ """ - logger.info('-' * 80) - logger.info(' Using default embedded rules.') - logger.info(' ') - logger.info(' You can see the current default rule set here:') - logger.info(' https://github.com/fireeye/capa-rules') - logger.info('-' * 80) + logger.info("-" * 80) + logger.info(" Using default embedded rules.") + logger.info(" ") + logger.info(" You can see the current default rule set here:") + logger.info(" https://github.com/fireeye/capa-rules") + logger.info("-" * 80) - rules_path = os.path.join(os.path.dirname(self.file_loc), '../..', 'rules') + rules_path = os.path.join(os.path.dirname(self.file_loc), "../..", "rules") rules = capa.main.get_rules(rules_path) rules = capa.rules.RuleSet(rules) capabilities = capa.main.find_capabilities(rules, capa.features.extractors.ida.IdaFeatureExtractor(), True) @@ -350,27 +344,30 @@ class CapaExplorerForm(idaapi.PluginForm): # support binary files specifically for x86/AMD64 shellcode # warn user binary file is loaded but still allow capa to process it # TODO: check specific architecture of binary files based on how user configured IDA processors - if idaapi.get_file_type_name() == 'Binary file': - logger.warning('-' * 80) - logger.warning(' Input file appears to be a binary file.') - logger.warning(' ') + if idaapi.get_file_type_name() == "Binary file": + logger.warning("-" * 80) + logger.warning(" Input file appears to be a binary file.") + logger.warning(" ") logger.warning( - ' capa currently only supports analyzing binary files containing x86/AMD64 shellcode with IDA.') + " capa currently only supports analyzing binary files containing x86/AMD64 shellcode with IDA." 
+ ) logger.warning( - ' This means the results may be misleading or incomplete if the binary file loaded in IDA is not x86/AMD64.') - logger.warning(' If you don\'t know the input file type, you can try using the `file` utility to guess it.') - logger.warning('-' * 80) + " This means the results may be misleading or incomplete if the binary file loaded in IDA is not x86/AMD64." + ) + logger.warning(" If you don't know the input file type, you can try using the `file` utility to guess it.") + logger.warning("-" * 80) - capa.ida.helpers.inform_user_ida_ui('capa encountered warnings during analysis') + capa.ida.helpers.inform_user_ida_ui("capa encountered warnings during analysis") if capa.main.has_file_limitation(rules, capabilities, is_standalone=False): - capa.ida.helpers.inform_user_ida_ui('capa encountered warnings during analysis') + capa.ida.helpers.inform_user_ida_ui("capa encountered warnings during analysis") - logger.info('analysis completed.') + logger.info("analysis completed.") doc = capa.render.convert_capabilities_to_result_document(rules, capabilities) import json + with open("C:\\Users\\spring\\Desktop\\hmm.json", "w") as twitter_data_file: json.dump(doc, twitter_data_file, indent=4, sort_keys=True, cls=capa.render.CapaJsonObjectEncoder) @@ -380,22 +377,22 @@ class CapaExplorerForm(idaapi.PluginForm): self.view_tree.sortByColumn(CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION, QtCore.Qt.AscendingOrder) - logger.info('render views completed.') + logger.info("render views completed.") def render_capa_doc_summary(self, doc): """ """ for (row, rule) in enumerate(rutils.capability_rules(doc)): - count = len(rule['matches']) + count = len(rule["matches"]) if count == 1: - capability = rule['meta']['name'] + capability = rule["meta"]["name"] else: - capability = '%s (%d matches)' % (rule['meta']['name'], count) + capability = "%s (%d matches)" % (rule["meta"]["name"], count) self.view_summary.setRowCount(row + 1) self.view_summary.setItem(row, 0, 
self.render_new_table_header_item(capability)) - self.view_summary.setItem(row, 1, QtWidgets.QTableWidgetItem(rule['meta']['namespace'])) + self.view_summary.setItem(row, 1, QtWidgets.QTableWidgetItem(rule["meta"]["namespace"])) # resize columns to content self.view_summary.resizeColumnsToContents() @@ -404,17 +401,17 @@ class CapaExplorerForm(idaapi.PluginForm): """ """ tactics = collections.defaultdict(set) for rule in rutils.capability_rules(doc): - if not rule['meta'].get('att&ck'): + if not rule["meta"].get("att&ck"): continue - for attack in rule['meta']['att&ck']: - tactic, _, rest = attack.partition('::') - if '::' in rest: - technique, _, rest = rest.partition('::') - subtechnique, _, id = rest.rpartition(' ') + for attack in rule["meta"]["att&ck"]: + tactic, _, rest = attack.partition("::") + if "::" in rest: + technique, _, rest = rest.partition("::") + subtechnique, _, id = rest.rpartition(" ") tactics[tactic].add((technique, subtechnique, id)) else: - technique, _, id = rest.rpartition(' ') + technique, _, id = rest.rpartition(" ") tactics[tactic].add((technique, id)) column_one = [] @@ -422,17 +419,17 @@ class CapaExplorerForm(idaapi.PluginForm): for tactic, techniques in sorted(tactics.items()): column_one.append(tactic.upper()) - column_one.extend(['' for i in range(len(techniques) - 1)]) + column_one.extend(["" for i in range(len(techniques) - 1)]) for spec in sorted(techniques): if len(spec) == 2: technique, id = spec - column_two.append('%s %s' % (technique, id)) + column_two.append("%s %s" % (technique, id)) elif len(spec) == 3: technique, subtechnique, id = spec - column_two.append('%s::%s %s' % (technique, subtechnique, id)) + column_two.append("%s::%s %s" % (technique, subtechnique, id)) else: - raise RuntimeError('unexpected ATT&CK spec format') + raise RuntimeError("unexpected ATT&CK spec format") self.view_attack.setRowCount(max(len(column_one), len(column_two))) @@ -471,8 +468,8 @@ class CapaExplorerForm(idaapi.PluginForm): 
self.view_summary.setRowCount(0) self.load_capa_results() - logger.info('reload complete.') - idaapi.info('%s reload completed.' % PLUGIN_NAME) + logger.info("reload complete.") + idaapi.info("%s reload completed." % PLUGIN_NAME) def reset(self): """ reset user interface elements @@ -481,8 +478,8 @@ class CapaExplorerForm(idaapi.PluginForm): """ self.ida_reset() - logger.info('reset completed.') - idaapi.info('%s reset completed.' % PLUGIN_NAME) + logger.info("reset completed.") + idaapi.info("%s reset completed." % PLUGIN_NAME) def slot_menu_bar_hovered(self, action): """ display menu action tooltip @@ -491,7 +488,9 @@ class CapaExplorerForm(idaapi.PluginForm): @reference: https://stackoverflow.com/questions/21725119/why-wont-qtooltips-appear-on-qactions-within-a-qmenu """ - QtWidgets.QToolTip.showText(QtGui.QCursor.pos(), action.toolTip(), self.view_menu_bar, self.view_menu_bar.actionGeometry(action)) + QtWidgets.QToolTip.showText( + QtGui.QCursor.pos(), action.toolTip(), self.view_menu_bar, self.view_menu_bar.actionGeometry(action) + ) def slot_checkbox_limit_by_changed(self): """ slot activated if checkbox clicked @@ -499,7 +498,7 @@ class CapaExplorerForm(idaapi.PluginForm): if checked, configure function filter if screen location is located in function, otherwise clear filter """ - match = '' + match = "" if self.view_checkbox_limit_by.isChecked(): location = capa.ida.helpers.get_func_start_ea(idaapi.get_screen_ea()) if location: @@ -530,5 +529,5 @@ def main(): CAPA_EXPLORER_FORM.Show() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/capa/ida/ida_rule_generator.py b/capa/ida/ida_rule_generator.py index f51b1900..ffa48376 100644 --- a/capa/ida/ida_rule_generator.py +++ b/capa/ida/ida_rule_generator.py @@ -19,10 +19,10 @@ from capa.ida import plugin_helpers import capa.features.extractors.ida.helpers -logger = logging.getLogger('rulegen') +logger = logging.getLogger("rulegen") -AUTHOR_NAME = '' +AUTHOR_NAME = "" COLOR_HIGHLIGHT = 
0xD096FF @@ -35,11 +35,11 @@ def get_func_start(ea): class Hooks(idaapi.UI_Hooks): - ''' + """ Notifies the plugin when navigating to another function NOTE: it uses the global variable FLEX to access the PluginForm object. This looks nasty, maybe there is a better way? - ''' + """ def screen_ea_changed(self, ea, prev_ea): widget = idaapi.get_current_widget() @@ -55,14 +55,13 @@ class Hooks(idaapi.UI_Hooks): # changed to another function RULE_GEN_FORM.reload_features_tree() except Exception as e: - logger.warn('exception: %s', e) + logger.warn("exception: %s", e) class RuleGeneratorForm(idaapi.PluginForm): - def __init__(self): super(RuleGeneratorForm, self).__init__() - self.title = 'capa rule generator' + self.title = "capa rule generator" self.parent = None self.parent_items = {} @@ -70,7 +69,7 @@ class RuleGeneratorForm(idaapi.PluginForm): self.hooks = Hooks() # dirty? if self.hooks.hook(): - logger.info('UI notification hook installed successfully') + logger.info("UI notification hook installed successfully") def init_ui(self): self.tree = QTreeWidget() @@ -79,7 +78,7 @@ class RuleGeneratorForm(idaapi.PluginForm): self.reload_features_tree() - button_reset = QtWidgets.QPushButton('&Reset') + button_reset = QtWidgets.QPushButton("&Reset") button_reset.clicked.connect(self.reset) h_layout = QtWidgets.QHBoxLayout() @@ -96,7 +95,7 @@ class RuleGeneratorForm(idaapi.PluginForm): def reset(self): plugin_helpers.reset_selection(self.tree) plugin_helpers.reset_colors(self.orig_colors) - self.rule_text.setText('') + self.rule_text.setText("") def reload_features_tree(self): self.reset() @@ -119,7 +118,7 @@ class RuleGeneratorForm(idaapi.PluginForm): extractor = capa.features.extractors.ida.IdaFeatureExtractor() f = idaapi.get_func(idaapi.get_screen_ea()) if not f: - logger.info('function does not exist at 0x%x', idaapi.get_screen_ea()) + logger.info("function does not exist at 0x%x", idaapi.get_screen_ea()) return return self.extract_function_features(f) @@ -137,7 +136,7 
@@ class RuleGeneratorForm(idaapi.PluginForm): def create_tree(self, features): self.tree.setMinimumWidth(400) # self.tree.setMinimumHeight(300) - self.tree.setHeaderLabels(['Feature', 'Virtual Address', 'Disassembly']) + self.tree.setHeaderLabels(["Feature", "Virtual Address", "Disassembly"]) # auto resize columns self.tree.header().setSectionResizeMode(QHeaderView.ResizeToContents) self.tree.itemClicked.connect(self.on_item_clicked) @@ -151,16 +150,22 @@ class RuleGeneratorForm(idaapi.PluginForm): # level 1 if feature not in self.parent_items: - self.parent_items[feature] = plugin_helpers.add_child_item(self.parent_items[type(feature)], [str(feature)]) + self.parent_items[feature] = plugin_helpers.add_child_item( + self.parent_items[type(feature)], [str(feature)] + ) # level n > 1 if len(vas) > 1: for va in sorted(vas): - plugin_helpers.add_child_item(self.parent_items[feature], [str(feature), '0x%X' % va, plugin_helpers.get_disasm_line(va)], feature) + plugin_helpers.add_child_item( + self.parent_items[feature], + [str(feature), "0x%X" % va, plugin_helpers.get_disasm_line(va)], + feature, + ) else: va = vas.pop() self.parent_items[feature].setText(0, str(feature)) - self.parent_items[feature].setText(1, '0x%X' % va) + self.parent_items[feature].setText(1, "0x%X" % va) self.parent_items[feature].setText(2, plugin_helpers.get_disasm_line(va)) self.parent_items[feature].setData(0, 0x100, feature) @@ -188,29 +193,31 @@ class RuleGeneratorForm(idaapi.PluginForm): def get_rule_from_features(self, features): rule_parts = [] - counted = zip(Counter(features).keys(), # equals to list(set(words)) - Counter(features).values()) # counts the elements' frequency + counted = zip( + Counter(features).keys(), Counter(features).values() # equals to list(set(words)) + ) # counts the elements' frequency # single features for k, v in filter(lambda t: t[1] == 1, counted): # TODO args to hex if int - if k.name.lower() == 'bytes': + if k.name.lower() == "bytes": # Convert raw bytes to 
uppercase hex representation (e.g., '12 34 56') upper_hex_bytes = binascii.hexlify(args_to_str(k.args)).upper() - rule_value_str = '' + rule_value_str = "" for i in range(0, len(upper_hex_bytes), 2): - rule_value_str += upper_hex_bytes[i:i + 2] + ' ' - r = ' - %s: %s' % (k.name.lower(), rule_value_str) + rule_value_str += upper_hex_bytes[i : i + 2] + " " + r = " - %s: %s" % (k.name.lower(), rule_value_str) else: - r = ' - %s: %s' % (k.name.lower(), args_to_str(k.args)) + r = " - %s: %s" % (k.name.lower(), args_to_str(k.args)) rule_parts.append(r) # counted features for k, v in filter(lambda t: t[1] > 1, counted): - r = ' - count(%s): %d' % (str(k), v) + r = " - count(%s): %d" % (str(k), v) rule_parts.append(r) - rule_prefix = textwrap.dedent(''' + rule_prefix = textwrap.dedent( + """ rule: meta: name: @@ -219,8 +226,10 @@ class RuleGeneratorForm(idaapi.PluginForm): examples: - %s:0x%X features: - ''' % (AUTHOR_NAME, idc.retrieve_input_file_md5(), get_func_start(idc.here()))).strip() - return '%s\n%s' % (rule_prefix, '\n'.join(sorted(rule_parts))) + """ + % (AUTHOR_NAME, idc.retrieve_input_file_md5(), get_func_start(idc.here())) + ).strip() + return "%s\n%s" % (rule_prefix, "\n".join(sorted(rule_parts))) # TODO merge into capa_idautils, get feature data def get_selected_items(self): @@ -242,26 +251,25 @@ class RuleGeneratorForm(idaapi.PluginForm): self.init_ui() def Show(self): - return idaapi.PluginForm.Show(self, self.title, options=( - idaapi.PluginForm.WOPN_RESTORE - | idaapi.PluginForm.WOPN_PERSIST - )) + return idaapi.PluginForm.Show( + self, self.title, options=(idaapi.PluginForm.WOPN_RESTORE | idaapi.PluginForm.WOPN_PERSIST) + ) def OnClose(self, form): self.reset() if self.hooks.unhook(): - logger.info('UI notification hook uninstalled successfully') - logger.info('RuleGeneratorForm closed') + logger.info("UI notification hook uninstalled successfully") + logger.info("RuleGeneratorForm closed") def args_to_str(args): a = [] for arg in args: if 
(isinstance(arg, int) or isinstance(arg, long)) and arg > 10: - a.append('0x%X' % arg) + a.append("0x%X" % arg) else: a.append(str(arg)) - return ','.join(a) + return ",".join(a) def main(): @@ -280,5 +288,5 @@ def main(): RULE_GEN_FORM.Show() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/capa/ida/plugin_helpers.py b/capa/ida/plugin_helpers.py index ab25beba..4dd4756b 100644 --- a/capa/ida/plugin_helpers.py +++ b/capa/ida/plugin_helpers.py @@ -8,34 +8,34 @@ import idc import idaapi -CAPA_EXTENSION = '.capas' +CAPA_EXTENSION = ".capas" -logger = logging.getLogger('capa_ida') +logger = logging.getLogger("capa_ida") def get_input_file(freeze=True): - ''' + """ get input file path freeze (bool): if True, get freeze file if it exists - ''' + """ # try original file in same directory as idb/i64 without idb/i64 file extension input_file = idc.get_idb_path()[:-4] if freeze: # use frozen file if it exists - freeze_file_cand = '%s%s' % (input_file, CAPA_EXTENSION) + freeze_file_cand = "%s%s" % (input_file, CAPA_EXTENSION) if os.path.isfile(freeze_file_cand): return freeze_file_cand if not os.path.isfile(input_file): # TM naming - input_file = '%s.mal_' % idc.get_idb_path()[:-4] + input_file = "%s.mal_" % idc.get_idb_path()[:-4] if not os.path.isfile(input_file): - input_file = idaapi.ask_file(0, '*.*', 'Please specify input file.') + input_file = idaapi.ask_file(0, "*.*", "Please specify input file.") if not input_file: - raise ValueError('could not find input file') + raise ValueError("could not find input file") return input_file diff --git a/capa/main.py b/capa/main.py index eee137ca..29a85e35 100644 --- a/capa/main.py +++ b/capa/main.py @@ -1,7 +1,7 @@ #!/usr/bin/env python2 -''' +""" capa - detect capabilities in programs. 
-''' +""" import os import os.path import sys @@ -23,16 +23,16 @@ import capa.features.extractors from capa.helpers import oint -SUPPORTED_FILE_MAGIC = set(['MZ']) +SUPPORTED_FILE_MAGIC = set(["MZ"]) -logger = logging.getLogger('capa') +logger = logging.getLogger("capa") def set_vivisect_log_level(level): - logging.getLogger('vivisect').setLevel(level) - logging.getLogger('vtrace').setLevel(level) - logging.getLogger('envi').setLevel(level) + logging.getLogger("vivisect").setLevel(level) + logging.getLogger("vtrace").setLevel(level) + logging.getLogger("envi").setLevel(level) def find_function_capabilities(ruleset, extractor, f): @@ -83,7 +83,7 @@ def find_file_capabilities(ruleset, extractor, function_features): if feature not in file_features: file_features[feature] = set() - logger.info('analyzed file and extracted %d features', len(file_features)) + logger.info("analyzed file and extracted %d features", len(file_features)) file_features.update(function_features) @@ -95,7 +95,7 @@ def find_capabilities(ruleset, extractor, disable_progress=None): all_function_matches = collections.defaultdict(list) all_bb_matches = collections.defaultdict(list) - for f in tqdm.tqdm(extractor.get_functions(), disable=disable_progress, unit=' functions'): + for f in tqdm.tqdm(extractor.get_functions(), disable=disable_progress, unit=" functions"): function_matches, bb_matches = find_function_capabilities(ruleset, extractor, f) for rule_name, res in function_matches.items(): all_function_matches[rule_name].extend(res) @@ -104,8 +104,10 @@ def find_capabilities(ruleset, extractor, disable_progress=None): # mapping from matched rule feature to set of addresses at which it matched. 
# type: Dict[MatchedRule, Set[int]] - function_features = {capa.features.MatchedRule(rule_name): set(map(lambda p: p[0], results)) - for rule_name, results in all_function_matches.items()} + function_features = { + capa.features.MatchedRule(rule_name): set(map(lambda p: p[0], results)) + for rule_name, results in all_function_matches.items() + } all_file_matches = find_file_capabilities(ruleset, extractor, function_features) @@ -119,7 +121,7 @@ def find_capabilities(ruleset, extractor, disable_progress=None): def has_rule_with_namespace(rules, capabilities, rule_cat): for rule_name in capabilities.keys(): - if rules.rules[rule_name].meta.get('namespace', '').startswith(rule_cat): + if rules.rules[rule_name].meta.get("namespace", "").startswith(rule_cat): return True return False @@ -128,61 +130,61 @@ def has_file_limitation(rules, capabilities, is_standalone=True): file_limitations = { # capa will likely detect installer specific functionality. # this is probably not what the user wants. - 'executable/installer': [ - ' This sample appears to be an installer.', - ' ', - ' capa cannot handle installers well. This means the results may be misleading or incomplete.' - ' You should try to understand the install mechanism and analyze created files with capa.' + "executable/installer": [ + " This sample appears to be an installer.", + " ", + " capa cannot handle installers well. This means the results may be misleading or incomplete." + " You should try to understand the install mechanism and analyze created files with capa.", ], # capa won't detect much in .NET samples. # it might match some file-level things. # for consistency, bail on things that we don't support. - 'runtime/dotnet': [ - ' This sample appears to be a .NET module.', - ' ', - ' .NET is a cross-platform framework for running managed applications.', - ' capa cannot handle non-native files. 
This means that the results may be misleading or incomplete.', - ' You may have to analyze the file manually, using a tool like the .NET decompiler dnSpy.' + "runtime/dotnet": [ + " This sample appears to be a .NET module.", + " ", + " .NET is a cross-platform framework for running managed applications.", + " capa cannot handle non-native files. This means that the results may be misleading or incomplete.", + " You may have to analyze the file manually, using a tool like the .NET decompiler dnSpy.", ], # capa will detect dozens of capabilities for AutoIt samples, # but these are due to the AutoIt runtime, not the payload script. # so, don't confuse the user with FP matches - bail instead - 'compiler/autoit': [ - ' This sample appears to be compiled with AutoIt.', - ' ', - ' AutoIt is a freeware BASIC-like scripting language designed for automating the Windows GUI.', - ' capa cannot handle AutoIt scripts. This means that the results will be misleading or incomplete.', - ' You may have to analyze the file manually, using a tool like the AutoIt decompiler MyAut2Exe.' + "compiler/autoit": [ + " This sample appears to be compiled with AutoIt.", + " ", + " AutoIt is a freeware BASIC-like scripting language designed for automating the Windows GUI.", + " capa cannot handle AutoIt scripts. This means that the results will be misleading or incomplete.", + " You may have to analyze the file manually, using a tool like the AutoIt decompiler MyAut2Exe.", ], # capa won't detect much in packed samples - 'anti-analysis/packer/': [ - ' This sample appears to be packed.', - ' ', - ' Packed samples have often been obfuscated to hide their logic.', - ' capa cannot handle obfuscation well. This means the results may be misleading or incomplete.', - ' If possible, you should try to unpack this input file before analyzing it with capa.' 
- ] + "anti-analysis/packer/": [ + " This sample appears to be packed.", + " ", + " Packed samples have often been obfuscated to hide their logic.", + " capa cannot handle obfuscation well. This means the results may be misleading or incomplete.", + " If possible, you should try to unpack this input file before analyzing it with capa.", + ], } for category, dialogue in file_limitations.items(): if not has_rule_with_namespace(rules, capabilities, category): continue - logger.warning('-' * 80) + logger.warning("-" * 80) for line in dialogue: logger.warning(line) if is_standalone: - logger.warning(' ') - logger.warning(' Use -v or -vv if you really want to see the capabilities identified by capa.') - logger.warning('-' * 80) + logger.warning(" ") + logger.warning(" Use -v or -vv if you really want to see the capabilities identified by capa.") + logger.warning("-" * 80) return True return False def is_supported_file_type(sample): - ''' + """ Return if this is a supported file based on magic header values - ''' - with open(sample, 'rb') as f: + """ + with open(sample, "rb") as f: magic = f.read(2) if magic in SUPPORTED_FILE_MAGIC: return True @@ -190,36 +192,37 @@ def is_supported_file_type(sample): return False -def get_shellcode_vw(sample, arch='auto'): - ''' +def get_shellcode_vw(sample, arch="auto"): + """ Return shellcode workspace using explicit arch or via auto detect - ''' + """ import viv_utils - with open(sample, 'rb') as f: + + with open(sample, "rb") as f: sample_bytes = f.read() - if arch == 'auto': + if arch == "auto": # choose arch with most functions, idea by Jay G. 
vw_cands = [] - for arch in ['i386', 'amd64']: + for arch in ["i386", "amd64"]: vw_cands.append(viv_utils.getShellcodeWorkspace(sample_bytes, arch)) if not vw_cands: - raise ValueError('could not generate vivisect workspace') + raise ValueError("could not generate vivisect workspace") vw = max(vw_cands, key=lambda vw: len(vw.getFunctions())) else: vw = viv_utils.getShellcodeWorkspace(sample_bytes, arch) - vw.setMeta('Format', 'blob') # TODO fix in viv_utils + vw.setMeta("Format", "blob") # TODO fix in viv_utils return vw def get_meta_str(vw): - ''' + """ Return workspace meta information string - ''' + """ meta = [] - for k in ['Format', 'Platform', 'Architecture']: + for k in ["Format", "Platform", "Architecture"]: if k in vw.metadata: - meta.append('%s: %s' % (k.lower(), vw.metadata[k])) - return '%s, number of functions: %d' % (', '.join(meta), len(vw.getFunctions())) + meta.append("%s: %s" % (k.lower(), vw.metadata[k])) + return "%s, number of functions: %d" % (", ".join(meta), len(vw.getFunctions())) class UnsupportedFormatError(ValueError): @@ -228,23 +231,25 @@ class UnsupportedFormatError(ValueError): def get_workspace(path, format): import viv_utils - logger.info('generating vivisect workspace for: %s', path) - if format == 'auto': + + logger.info("generating vivisect workspace for: %s", path) + if format == "auto": if not is_supported_file_type(path): raise UnsupportedFormatError() vw = viv_utils.getWorkspace(path) - elif format == 'pe': + elif format == "pe": vw = viv_utils.getWorkspace(path) - elif format == 'sc32': - vw = get_shellcode_vw(path, arch='i386') - elif format == 'sc64': - vw = get_shellcode_vw(path, arch='amd64') - logger.info('%s', get_meta_str(vw)) + elif format == "sc32": + vw = get_shellcode_vw(path, arch="i386") + elif format == "sc64": + vw = get_shellcode_vw(path, arch="amd64") + logger.info("%s", get_meta_str(vw)) return vw def get_extractor_py2(path, format): import capa.features.extractors.viv + vw = get_workspace(path, format) 
return capa.features.extractors.viv.VivisectFeatureExtractor(vw, path) @@ -258,10 +263,10 @@ def get_extractor_py3(path, format): def get_extractor(path, format): - ''' + """ raises: UnsupportedFormatError: - ''' + """ if sys.version_info >= (3, 0): return get_extractor_py3(path, format) else: @@ -269,7 +274,7 @@ def get_extractor(path, format): def is_nursery_rule_path(path): - ''' + """ The nursery is a spot for rules that have not yet been fully polished. For example, they may not have references to public example of a technique. Yet, we still want to capture and report on their matches. @@ -277,23 +282,23 @@ def is_nursery_rule_path(path): When nursery rules are loaded, their metadata section should be updated with: `nursery=True`. - ''' - return 'nursery' in path + """ + return "nursery" in path def get_rules(rule_path): if not os.path.exists(rule_path): - raise IOError('%s does not exist or cannot be accessed' % rule_path) + raise IOError("%s does not exist or cannot be accessed" % rule_path) rule_paths = [] if os.path.isfile(rule_path): rule_paths.append(rule_path) elif os.path.isdir(rule_path): - logger.debug('reading rules from directory %s', rule_path) + logger.debug("reading rules from directory %s", rule_path) for root, dirs, files in os.walk(rule_path): for file in files: - if not file.endswith('.yml'): - logger.warning('skipping non-.yml file: %s', file) + if not file.endswith(".yml"): + logger.warning("skipping non-.yml file: %s", file) continue rule_path = os.path.join(root, file) @@ -301,18 +306,18 @@ def get_rules(rule_path): rules = [] for rule_path in rule_paths: - logger.debug('reading rule file: %s', rule_path) + logger.debug("reading rule file: %s", rule_path) try: rule = capa.rules.Rule.from_yaml_file(rule_path) except capa.rules.InvalidRule: raise else: - rule.meta['capa/path'] = rule_path + rule.meta["capa/path"] = rule_path if is_nursery_rule_path(rule_path): - rule.meta['capa/nursery'] = True + rule.meta["capa/nursery"] = True 
rules.append(rule) - logger.debug('rule: %s scope: %s', rule.name, rule.scope) + logger.debug("rule: %s scope: %s", rule.name, rule.scope) return rules @@ -322,35 +327,37 @@ def main(argv=None): argv = sys.argv[1:] formats = [ - ('auto', '(default) detect file type automatically'), - ('pe', 'Windows PE file'), - ('sc32', '32-bit shellcode'), - ('sc64', '64-bit shellcode'), - ('freeze', 'features previously frozen by capa'), + ("auto", "(default) detect file type automatically"), + ("pe", "Windows PE file"), + ("sc32", "32-bit shellcode"), + ("sc64", "64-bit shellcode"), + ("freeze", "features previously frozen by capa"), ] - format_help = ', '.join(['%s: %s' % (f[0], f[1]) for f in formats]) + format_help = ", ".join(["%s: %s" % (f[0], f[1]) for f in formats]) - parser = argparse.ArgumentParser(description='detect capabilities in programs.') - parser.add_argument('sample', type=str, - help='Path to sample to analyze') - parser.add_argument('-r', '--rules', type=str, default='(embedded rules)', - help='Path to rule file or directory, use embedded rules by default') - parser.add_argument('-t', '--tag', type=str, - help='Filter on rule meta field values') - parser.add_argument('--version', action='store_true', - help='Print the executable version and exit') - parser.add_argument('-j', '--json', action='store_true', - help='Emit JSON instead of text') - parser.add_argument('-v', '--verbose', action='store_true', - help='Enable verbose result document (no effect with --json)') - parser.add_argument('-vv', '--vverbose', action='store_true', - help='Enable very verbose result document (no effect with --json)') - parser.add_argument('-d', '--debug', action='store_true', - help='Enable debugging output on STDERR') - parser.add_argument('-q', '--quiet', action='store_true', - help='Disable all output but errors') - parser.add_argument('-f', '--format', choices=[f[0] for f in formats], default='auto', - help='Select sample format, %s' % format_help) + parser = 
argparse.ArgumentParser(description="detect capabilities in programs.") + parser.add_argument("sample", type=str, help="Path to sample to analyze") + parser.add_argument( + "-r", + "--rules", + type=str, + default="(embedded rules)", + help="Path to rule file or directory, use embedded rules by default", + ) + parser.add_argument("-t", "--tag", type=str, help="Filter on rule meta field values") + parser.add_argument("--version", action="store_true", help="Print the executable version and exit") + parser.add_argument("-j", "--json", action="store_true", help="Emit JSON instead of text") + parser.add_argument( + "-v", "--verbose", action="store_true", help="Enable verbose result document (no effect with --json)" + ) + parser.add_argument( + "-vv", "--vverbose", action="store_true", help="Enable very verbose result document (no effect with --json)" + ) + parser.add_argument("-d", "--debug", action="store_true", help="Enable debugging output on STDERR") + parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors") + parser.add_argument( + "-f", "--format", choices=[f[0] for f in formats], default="auto", help="Select sample format, %s" % format_help + ) args = parser.parse_args(args=argv) if args.version: @@ -375,68 +382,70 @@ def main(argv=None): # because cp65001 is utf-8, we just map that codepage to the utf-8 codec. 
# see #380 and: https://stackoverflow.com/a/3259271/87207 import codecs - codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) - if args.rules == '(embedded rules)': - logger.info('-' * 80) - logger.info(' Using default embedded rules.') - logger.info(' To provide your own rules, use the form `capa.exe ./path/to/rules/ /path/to/mal.exe`.') - logger.info(' You can see the current default rule set here:') - logger.info(' https://github.com/fireeye/capa-rules') - logger.info('-' * 80) + codecs.register(lambda name: codecs.lookup("utf-8") if name == "cp65001" else None) - if hasattr(sys, 'frozen') and hasattr(sys, '_MEIPASS'): - logger.debug('detected running under PyInstaller') - args.rules = os.path.join(sys._MEIPASS, 'rules') - logger.debug('default rule path (PyInstaller method): %s', args.rules) + if args.rules == "(embedded rules)": + logger.info("-" * 80) + logger.info(" Using default embedded rules.") + logger.info(" To provide your own rules, use the form `capa.exe ./path/to/rules/ /path/to/mal.exe`.") + logger.info(" You can see the current default rule set here:") + logger.info(" https://github.com/fireeye/capa-rules") + logger.info("-" * 80) + + if hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS"): + logger.debug("detected running under PyInstaller") + args.rules = os.path.join(sys._MEIPASS, "rules") + logger.debug("default rule path (PyInstaller method): %s", args.rules) else: - logger.debug('detected running from source') - args.rules = os.path.join(os.path.dirname(__file__), '..', 'rules') - logger.debug('default rule path (source method): %s', args.rules) + logger.debug("detected running from source") + args.rules = os.path.join(os.path.dirname(__file__), "..", "rules") + logger.debug("default rule path (source method): %s", args.rules) else: - logger.info('using rules path: %s', args.rules) + logger.info("using rules path: %s", args.rules) try: rules = get_rules(args.rules) rules = capa.rules.RuleSet(rules) - 
logger.info('successfully loaded %s rules', len(rules)) + logger.info("successfully loaded %s rules", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) - logger.info('selected %s rules', len(rules)) + logger.info("selected %s rules", len(rules)) for i, r in enumerate(rules.rules, 1): # TODO don't display subscope rules? - logger.debug(' %d. %s', i, r) + logger.debug(" %d. %s", i, r) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: - logger.error('%s', str(e)) + logger.error("%s", str(e)) return -1 - with open(args.sample, 'rb') as f: + with open(args.sample, "rb") as f: taste = f.read(8) - if ((args.format == 'freeze') - or (args.format == 'auto' and capa.features.freeze.is_freeze(taste))): - with open(args.sample, 'rb') as f: + if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)): + with open(args.sample, "rb") as f: extractor = capa.features.freeze.load(f.read()) else: try: extractor = get_extractor(args.sample, args.format) except UnsupportedFormatError: - logger.error('-' * 80) - logger.error(' Input file does not appear to be a PE file.') - logger.error(' ') - logger.error(' capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64).') - logger.error(' If you don\'t know the input file type, you can try using the `file` utility to guess it.') - logger.error('-' * 80) + logger.error("-" * 80) + logger.error(" Input file does not appear to be a PE file.") + logger.error(" ") + logger.error( + " capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64)." 
+ ) + logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") + logger.error("-" * 80) return -1 except UnsupportedRuntimeError: - logger.error('-' * 80) - logger.error(' Unsupported runtime or Python interpreter.') - logger.error(' ') - logger.error(' capa supports running under Python 2.7 using Vivisect for binary analysis.') - logger.error(' It can also run within IDA Pro, using either Python 2.7 or 3.5+.') - logger.error(' ') - logger.error(' If you\'re seeing this message on the command line, please ensure you\'re running Python 2.7.') - logger.error('-' * 80) + logger.error("-" * 80) + logger.error(" Unsupported runtime or Python interpreter.") + logger.error(" ") + logger.error(" capa supports running under Python 2.7 using Vivisect for binary analysis.") + logger.error(" It can also run within IDA Pro, using either Python 2.7 or 3.5+.") + logger.error(" ") + logger.error(" If you're seeing this message on the command line, please ensure you're running Python 2.7.") + logger.error("-" * 80) return -1 capabilities = find_capabilities(rules, extractor) @@ -462,7 +471,7 @@ def main(argv=None): print(capa.render.render_default(rules, capabilities)) colorama.deinit() - logger.info('done.') + logger.info("done.") return 0 @@ -472,34 +481,37 @@ def ida_main(): logging.getLogger().setLevel(logging.INFO) import capa.ida.helpers + if not capa.ida.helpers.is_supported_file_type(): return -1 - logger.info('-' * 80) - logger.info(' Using default embedded rules.') - logger.info(' ') - logger.info(' You can see the current default rule set here:') - logger.info(' https://github.com/fireeye/capa-rules') - logger.info('-' * 80) + logger.info("-" * 80) + logger.info(" Using default embedded rules.") + logger.info(" ") + logger.info(" You can see the current default rule set here:") + logger.info(" https://github.com/fireeye/capa-rules") + logger.info("-" * 80) - if hasattr(sys, 'frozen') and hasattr(sys, '_MEIPASS'): - 
logger.debug('detected running under PyInstaller') - rules_path = os.path.join(sys._MEIPASS, 'rules') - logger.debug('default rule path (PyInstaller method): %s', rules_path) + if hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS"): + logger.debug("detected running under PyInstaller") + rules_path = os.path.join(sys._MEIPASS, "rules") + logger.debug("default rule path (PyInstaller method): %s", rules_path) else: - logger.debug('detected running from source') - rules_path = os.path.join(os.path.dirname(__file__), '..', 'rules') - logger.debug('default rule path (source method): %s', rules_path) + logger.debug("detected running from source") + rules_path = os.path.join(os.path.dirname(__file__), "..", "rules") + logger.debug("default rule path (source method): %s", rules_path) rules = get_rules(rules_path) import capa.rules + rules = capa.rules.RuleSet(rules) import capa.features.extractors.ida + capabilities = find_capabilities(rules, capa.features.extractors.ida.IdaFeatureExtractor()) if has_file_limitation(rules, capabilities, is_standalone=False): - capa.ida.helpers.inform_user_ida_ui('capa encountered warnings during analysis') + capa.ida.helpers.inform_user_ida_ui("capa encountered warnings during analysis") render_capabilities_default(rules, capabilities) @@ -513,7 +525,7 @@ def is_runtime_ida(): return True -if __name__ == '__main__': +if __name__ == "__main__": if is_runtime_ida(): ida_main() else: diff --git a/capa/render/__init__.py b/capa/render/__init__.py index cafa7aac..1f8dbb55 100644 --- a/capa/render/__init__.py +++ b/capa/render/__init__.py @@ -18,43 +18,41 @@ def convert_statement_to_result_document(statement): """ if isinstance(statement, capa.engine.And): return { - 'type': 'and', + "type": "and", } elif isinstance(statement, capa.engine.Or): return { - 'type': 'or', + "type": "or", } elif isinstance(statement, capa.engine.Not): return { - 'type': 'not', + "type": "not", } elif isinstance(statement, capa.engine.Some) and statement.count == 0: - 
return { - 'type': 'optional' - } + return {"type": "optional"} elif isinstance(statement, capa.engine.Some) and statement.count > 0: return { - 'type': 'some', - 'count': statement.count, + "type": "some", + "count": statement.count, } elif isinstance(statement, capa.engine.Range): return { - 'type': 'range', - 'min': statement.min, - 'max': statement.max, - 'child': convert_feature_to_result_document(statement.child), + "type": "range", + "min": statement.min, + "max": statement.max, + "child": convert_feature_to_result_document(statement.child), } elif isinstance(statement, capa.engine.Regex): return { - 'type': 'regex', - 'pattern': statement.pattern, + "type": "regex", + "pattern": statement.pattern, # the string that was matched - 'match': statement.match, + "match": statement.match, } elif isinstance(statement, capa.engine.Subscope): return { - 'type': 'subscope', - 'subscope': statement.scope, + "type": "subscope", + "subscope": statement.scope, } else: raise RuntimeError("unexpected match statement type: " + str(statement)) @@ -89,8 +87,8 @@ def convert_feature_to_result_document(feature): # make the terms pretty name = name.lower() - if name == 'matchedrule': - name = 'match' + if name == "matchedrule": + name = "match" # in the common case, there's a single argument # so use it directly. 
@@ -99,7 +97,7 @@ def convert_feature_to_result_document(feature): value = value[0] return { - 'type': name, + "type": name, name: value, } @@ -119,13 +117,13 @@ def convert_node_to_result_document(node): if isinstance(node, capa.engine.Statement): return { - 'type': 'statement', - 'statement': convert_statement_to_result_document(node), + "type": "statement", + "statement": convert_statement_to_result_document(node), } elif isinstance(node, capa.features.Feature): return { - 'type': 'feature', - 'feature': convert_feature_to_result_document(node), + "type": "feature", + "feature": convert_feature_to_result_document(node), } else: raise RuntimeError("unexpected match node type") @@ -137,19 +135,16 @@ def convert_match_to_result_document(rules, capabilities, result): this will become part of the "result document" format that can be emitted to JSON. """ doc = { - 'success': bool(result.success), - 'node': convert_node_to_result_document(result.statement), - 'children': [ - convert_match_to_result_document(rules, capabilities, child) - for child in result.children - ], + "success": bool(result.success), + "node": convert_node_to_result_document(result.statement), + "children": [convert_match_to_result_document(rules, capabilities, child) for child in result.children], } # logic expression, like `and`, don't have locations - their children do. # so only add `locations` to feature nodes. if isinstance(result.statement, capa.features.Feature): if bool(result.success): - doc['locations'] = result.locations + doc["locations"] = result.locations # if we have a `match` statement, then we're referencing another rule. # this could an external rule (written by a human), or @@ -159,31 +154,30 @@ def convert_match_to_result_document(rules, capabilities, result): # so, we need to lookup the other rule results # and then filter those down to the address used here. # finally, splice that logic into this tree. 
- if (doc['node']['type'] == 'feature' - and doc['node']['feature']['type'] == 'match' - # only add subtree on success, - # because there won't be results for the other rule on failure. - and doc['success']): + if ( + doc["node"]["type"] == "feature" + and doc["node"]["feature"]["type"] == "match" + # only add subtree on success, + # because there won't be results for the other rule on failure. + and doc["success"] + ): - rule_name = doc['node']['feature']['match'] + rule_name = doc["node"]["feature"]["match"] rule = rules[rule_name] rule_matches = {address: result for (address, result) in capabilities[rule_name]} - if rule.meta.get('capa/subscope-rule'): + if rule.meta.get("capa/subscope-rule"): # for a subscope rule, fixup the node to be a scope node, rather than a match feature node. # # e.g. `contain loop/30c4c78e29bf4d54894fc74f664c62e8` -> `basic block` - scope = rule.meta['scope'] - doc['node'] = { - 'type': 'statement', - 'statement': { - 'type': 'subscope', - 'subscope': scope, - }, + scope = rule.meta["scope"] + doc["node"] = { + "type": "statement", + "statement": {"type": "subscope", "subscope": scope,}, } - for location in doc['locations']: - doc['children'].append(convert_match_to_result_document(rules, capabilities, rule_matches[location])) + for location in doc["locations"]: + doc["children"].append(convert_match_to_result_document(rules, capabilities, rule_matches[location])) return doc @@ -220,15 +214,14 @@ def convert_capabilities_to_result_document(rules, capabilities): for rule_name, matches in capabilities.items(): rule = rules[rule_name] - if rule.meta.get('capa/subscope-rule'): + if rule.meta.get("capa/subscope-rule"): continue doc[rule_name] = { - 'meta': dict(rule.meta), - 'source': rule.definition, - 'matches': { - addr: convert_match_to_result_document(rules, capabilities, match) - for (addr, match) in matches + "meta": dict(rule.meta), + "source": rule.definition, + "matches": { + addr: convert_match_to_result_document(rules, 
capabilities, match) for (addr, match) in matches }, } @@ -241,6 +234,7 @@ def render_vverbose(rules, capabilities): # and capa.render.vverbose import capa.render (implicitly, as a submodule) # so, defer the import until routine is called, breaking the import loop. import capa.render.vverbose + doc = convert_capabilities_to_result_document(rules, capabilities) return capa.render.vverbose.render_vverbose(doc) @@ -248,6 +242,7 @@ def render_vverbose(rules, capabilities): def render_verbose(rules, capabilities): # break import loop import capa.render.verbose + doc = convert_capabilities_to_result_document(rules, capabilities) return capa.render.verbose.render_verbose(doc) @@ -256,6 +251,7 @@ def render_default(rules, capabilities): # break import loop import capa.render.verbose import capa.render.default + doc = convert_capabilities_to_result_document(rules, capabilities) return capa.render.default.render_default(doc) @@ -273,7 +269,5 @@ class CapaJsonObjectEncoder(json.JSONEncoder): def render_json(rules, capabilities): return json.dumps( - convert_capabilities_to_result_document(rules, capabilities), - cls=CapaJsonObjectEncoder, - sort_keys=True, + convert_capabilities_to_result_document(rules, capabilities), cls=CapaJsonObjectEncoder, sort_keys=True, ) diff --git a/capa/render/default.py b/capa/render/default.py index 978eb1cc..fbb0fb12 100644 --- a/capa/render/default.py +++ b/capa/render/default.py @@ -9,7 +9,7 @@ import capa.render.utils as rutils def width(s, character_count): """pad the given string to at least `character_count`""" if len(s) < character_count: - return s + ' ' * (character_count - len(s)) + return s + " " * (character_count - len(s)) else: return s @@ -28,15 +28,15 @@ def render_capabilities(doc, ostream): """ rows = [] for rule in rutils.capability_rules(doc): - count = len(rule['matches']) + count = len(rule["matches"]) if count == 1: - capability = rutils.bold(rule['meta']['name']) + capability = rutils.bold(rule["meta"]["name"]) else: - 
capability = '%s (%d matches)' % (rutils.bold(rule['meta']['name']), count) - rows.append((capability, rule['meta']['namespace'])) + capability = "%s (%d matches)" % (rutils.bold(rule["meta"]["name"]), count) + rows.append((capability, rule["meta"]["namespace"])) - ostream.write(tabulate.tabulate(rows, headers=[width('CAPABILITY', 40), width('NAMESPACE', 40)], tablefmt='psql')) - ostream.write('\n') + ostream.write(tabulate.tabulate(rows, headers=[width("CAPABILITY", 40), width("NAMESPACE", 40)], tablefmt="psql")) + ostream.write("\n") def render_attack(doc, ostream): @@ -57,17 +57,17 @@ def render_attack(doc, ostream): """ tactics = collections.defaultdict(set) for rule in rutils.capability_rules(doc): - if not rule['meta'].get('att&ck'): + if not rule["meta"].get("att&ck"): continue - for attack in rule['meta']['att&ck']: - tactic, _, rest = attack.partition('::') - if '::' in rest: - technique, _, rest = rest.partition('::') - subtechnique, _, id = rest.rpartition(' ') + for attack in rule["meta"]["att&ck"]: + tactic, _, rest = attack.partition("::") + if "::" in rest: + technique, _, rest = rest.partition("::") + subtechnique, _, id = rest.rpartition(" ") tactics[tactic].add((technique, subtechnique, id)) else: - technique, _, id = rest.rpartition(' ') + technique, _, id = rest.rpartition(" ") tactics[tactic].add((technique, id)) rows = [] @@ -76,15 +76,17 @@ def render_attack(doc, ostream): for spec in sorted(techniques): if len(spec) == 2: technique, id = spec - inner_rows.append('%s %s' % (rutils.bold(technique), id)) + inner_rows.append("%s %s" % (rutils.bold(technique), id)) elif len(spec) == 3: technique, subtechnique, id = spec - inner_rows.append('%s::%s %s' % (rutils.bold(technique), subtechnique, id)) + inner_rows.append("%s::%s %s" % (rutils.bold(technique), subtechnique, id)) else: - raise RuntimeError('unexpected ATT&CK spec format') - rows.append((rutils.bold(tactic.upper()), '\n'.join(inner_rows), )) - ostream.write(tabulate.tabulate(rows, 
headers=[width('ATT&CK Tactic', 20), width('ATT&CK Technique', 60)], tablefmt='psql')) - ostream.write('\n') + raise RuntimeError("unexpected ATT&CK spec format") + rows.append((rutils.bold(tactic.upper()), "\n".join(inner_rows),)) + ostream.write( + tabulate.tabulate(rows, headers=[width("ATT&CK Tactic", 20), width("ATT&CK Technique", 60)], tablefmt="psql") + ) + ostream.write("\n") def render_default(doc): diff --git a/capa/render/utils.py b/capa/render/utils.py index 712f0c77..91d88293 100644 --- a/capa/render/utils.py +++ b/capa/render/utils.py @@ -4,38 +4,40 @@ import termcolor def bold(s): """draw attention to the given string""" - return termcolor.colored(s, 'blue') + return termcolor.colored(s, "blue") def bold2(s): """draw attention to the given string, within a `bold` section""" - return termcolor.colored(s, 'green') + return termcolor.colored(s, "green") def hex(n): """render the given number using upper case hex, like: 0x123ABC""" - return '0x%X' % n + return "0x%X" % n def hex_string(h): """ render hex string e.g. 
"0a40b1" as "0A 40 B1" """ - return ' '.join(h[i:i + 2] for i in range(0, len(h), 2)).upper() + return " ".join(h[i : i + 2] for i in range(0, len(h), 2)).upper() def capability_rules(doc): """enumerate the rules in (namespace, name) order that are 'capability' rules (not lib/subscope/disposition/etc).""" - for (_, _, rule) in sorted(map(lambda rule: (rule['meta'].get('namespace', ''), rule['meta']['name'], rule), doc.values())): - if rule['meta'].get('lib'): + for (_, _, rule) in sorted( + map(lambda rule: (rule["meta"].get("namespace", ""), rule["meta"]["name"], rule), doc.values()) + ): + if rule["meta"].get("lib"): continue - if rule['meta'].get('capa/subscope'): + if rule["meta"].get("capa/subscope"): continue - if rule['meta'].get('maec/analysis-conclusion'): + if rule["meta"].get("maec/analysis-conclusion"): continue - if rule['meta'].get('maec/analysis-conclusion-ov'): + if rule["meta"].get("maec/analysis-conclusion-ov"): continue - if rule['meta'].get('maec/malware-category'): + if rule["meta"].get("maec/malware-category"): continue - if rule['meta'].get('maec/malware-category-ov'): + if rule["meta"].get("maec/malware-category-ov"): continue yield rule @@ -44,4 +46,4 @@ def capability_rules(doc): class StringIO(six.StringIO): def writeln(self, s): self.write(s) - self.write('\n') + self.write("\n") diff --git a/capa/render/verbose.py b/capa/render/verbose.py index 95261f1a..ed3bc350 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -24,29 +24,29 @@ def render_verbose(doc): ostream = rutils.StringIO() for rule in rutils.capability_rules(doc): - count = len(rule['matches']) + count = len(rule["matches"]) if count == 1: - capability = rutils.bold(rule['meta']['name']) + capability = rutils.bold(rule["meta"]["name"]) else: - capability = '%s (%d matches)' % (rutils.bold(rule['meta']['name']), count) + capability = "%s (%d matches)" % (rutils.bold(rule["meta"]["name"]), count) ostream.writeln(capability) rows = [] - for key in ('namespace', 
'description', 'scope'): - if key == 'name' or key not in rule['meta']: + for key in ("namespace", "description", "scope"): + if key == "name" or key not in rule["meta"]: continue - v = rule['meta'][key] + v = rule["meta"][key] if isinstance(v, list) and len(v) == 1: v = v[0] rows.append((key, v)) - if rule['meta']['scope'] != capa.rules.FILE_SCOPE: - locations = doc[rule['meta']['name']]['matches'].keys() - rows.append(('matches', '\n'.join(map(rutils.hex, locations)))) + if rule["meta"]["scope"] != capa.rules.FILE_SCOPE: + locations = doc[rule["meta"]["name"]]["matches"].keys() + rows.append(("matches", "\n".join(map(rutils.hex, locations)))) - ostream.writeln(tabulate.tabulate(rows, tablefmt='plain')) - ostream.write('\n') + ostream.writeln(tabulate.tabulate(rows, tablefmt="plain")) + ostream.write("\n") return ostream.getvalue() diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index e59a9596..7dd8174b 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -5,145 +5,147 @@ import capa.render.utils as rutils def render_statement(ostream, statement, indent=0): - ostream.write(' ' * indent) - if statement['type'] in ('and', 'or', 'optional'): - ostream.write(statement['type']) - ostream.writeln(':') - elif statement['type'] == 'not': + ostream.write(" " * indent) + if statement["type"] in ("and", "or", "optional"): + ostream.write(statement["type"]) + ostream.writeln(":") + elif statement["type"] == "not": # this statement is handled specially in `render_match` using the MODE_SUCCESS/MODE_FAILURE flags. - ostream.writeln('not:') - elif statement['type'] == 'some': - ostream.write(statement['count'] + ' or more') - ostream.writeln(':') - elif statement['type'] == 'range': + ostream.writeln("not:") + elif statement["type"] == "some": + ostream.write(statement["count"] + " or more") + ostream.writeln(":") + elif statement["type"] == "range": # `range` is a weird node, its almost a hybrid of statement+feature. 
# it is a specific feature repeated multiple times. # there's no additional logic in the feature part, just the existence of a feature. # so, we have to inline some of the feature rendering here. - child = statement['child'] - if child['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match'): - feature = '%s(%s)' % (child['type'], rutils.bold2(child[child['type']])) - elif child['type'] in ('number', 'offset'): - feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex(child[child['type']]))) - elif child['type'] == 'bytes': - feature = '%s(%s)' % (child['type'], rutils.bold2(rutils.hex_string(child[child['type']]))) - elif child['type'] == 'characteristic': - feature = 'characteristic(%s)' % (rutils.bold2(child['characteristic'][0])) + child = statement["child"] + if child["type"] in ("string", "api", "mnemonic", "basic block", "export", "import", "section", "match"): + feature = "%s(%s)" % (child["type"], rutils.bold2(child[child["type"]])) + elif child["type"] in ("number", "offset"): + feature = "%s(%s)" % (child["type"], rutils.bold2(rutils.hex(child[child["type"]]))) + elif child["type"] == "bytes": + feature = "%s(%s)" % (child["type"], rutils.bold2(rutils.hex_string(child[child["type"]]))) + elif child["type"] == "characteristic": + feature = "characteristic(%s)" % (rutils.bold2(child["characteristic"][0])) else: - raise RuntimeError('unexpected feature type: ' + str(child)) + raise RuntimeError("unexpected feature type: " + str(child)) - ostream.write('count(%s): ' % feature) + ostream.write("count(%s): " % feature) - if statement['max'] == statement['min']: - ostream.writeln('%d' % (statement['min'])) - elif statement['min'] == 0: - ostream.writeln('%d or fewer' % (statement['max'])) - elif statement['max'] == (1 << 64 - 1): - ostream.writeln('%d or more' % (statement['min'])) + if statement["max"] == statement["min"]: + ostream.writeln("%d" % (statement["min"])) + elif statement["min"] == 0: + 
ostream.writeln("%d or fewer" % (statement["max"])) + elif statement["max"] == (1 << 64 - 1): + ostream.writeln("%d or more" % (statement["min"])) else: - ostream.writeln('between %d and %d' % (statement['min'], statement['max'])) - elif statement['type'] == 'subscope': - ostream.write(statement['subscope']) - ostream.writeln(':') - elif statement['type'] == 'regex': + ostream.writeln("between %d and %d" % (statement["min"], statement["max"])) + elif statement["type"] == "subscope": + ostream.write(statement["subscope"]) + ostream.writeln(":") + elif statement["type"] == "regex": # regex is a `Statement` not a `Feature` # this is because it doesn't get extracted, but applies to all strings in scope. # so we have to handle it here - ostream.writeln('string: %s' % (statement['match'])) + ostream.writeln("string: %s" % (statement["match"])) else: raise RuntimeError("unexpected match statement type: " + str(statement)) def render_feature(ostream, match, feature, indent=0): - ostream.write(' ' * indent) + ostream.write(" " * indent) - if feature['type'] in ('string', 'api', 'mnemonic', 'basic block', 'export', 'import', 'section', 'match'): - ostream.write(feature['type']) - ostream.write(': ') - ostream.write(rutils.bold2(feature[feature['type']])) - elif feature['type'] in ('number', 'offset'): - ostream.write(feature['type']) - ostream.write(': ') - ostream.write(rutils.bold2(rutils.hex(feature[feature['type']]))) - elif feature['type'] == 'bytes': - ostream.write('bytes: ') + if feature["type"] in ("string", "api", "mnemonic", "basic block", "export", "import", "section", "match"): + ostream.write(feature["type"]) + ostream.write(": ") + ostream.write(rutils.bold2(feature[feature["type"]])) + elif feature["type"] in ("number", "offset"): + ostream.write(feature["type"]) + ostream.write(": ") + ostream.write(rutils.bold2(rutils.hex(feature[feature["type"]]))) + elif feature["type"] == "bytes": + ostream.write("bytes: ") # bytes is the uppercase, hex-encoded string. 
# it should always be an even number of characters (its hex). - ostream.write(rutils.bold2(rutils.hex_string(feature[feature['type']]))) - elif feature['type'] == 'characteristic': - ostream.write('characteristic(%s)' % (rutils.bold2(feature['characteristic'][0]))) + ostream.write(rutils.bold2(rutils.hex_string(feature[feature["type"]]))) + elif feature["type"] == "characteristic": + ostream.write("characteristic(%s)" % (rutils.bold2(feature["characteristic"][0]))) # note that regex is found in `render_statement` else: - raise RuntimeError('unexpected feature type: ' + str(feature)) + raise RuntimeError("unexpected feature type: " + str(feature)) # its possible to have an empty locations array here, # such as when we're in MODE_FAILURE and showing the logic # under a `not` statement (which will have no matched locations). - locations = list(sorted(match.get('locations', []))) + locations = list(sorted(match.get("locations", []))) if len(locations) == 1: - ostream.write(' @ ') + ostream.write(" @ ") ostream.write(rutils.hex(locations[0])) elif len(locations) > 1: - ostream.write(' @ ') + ostream.write(" @ ") if len(locations) > 4: # don't display too many locations, because it becomes very noisy. # probably only the first handful of locations will be useful for inspection. - ostream.write(', '.join(map(rutils.hex, locations[0:4]))) - ostream.write(', and %d more...' % (len(locations) - 4)) + ostream.write(", ".join(map(rutils.hex, locations[0:4]))) + ostream.write(", and %d more..." 
% (len(locations) - 4)) else: - ostream.write(', '.join(map(rutils.hex, locations))) + ostream.write(", ".join(map(rutils.hex, locations))) - ostream.write('\n') + ostream.write("\n") def render_node(ostream, match, node, indent=0): - if node['type'] == 'statement': - render_statement(ostream, node['statement'], indent=indent) - elif node['type'] == 'feature': - render_feature(ostream, match, node['feature'], indent=indent) + if node["type"] == "statement": + render_statement(ostream, node["statement"], indent=indent) + elif node["type"] == "feature": + render_feature(ostream, match, node["feature"], indent=indent) else: - raise RuntimeError('unexpected node type: ' + str(node)) + raise RuntimeError("unexpected node type: " + str(node)) # display nodes that successfully evaluated against the sample. -MODE_SUCCESS = 'success' +MODE_SUCCESS = "success" # display nodes that did not evaluate to True against the sample. # this is useful when rendering the logic tree under a `not` node. -MODE_FAILURE = 'failure' +MODE_FAILURE = "failure" def render_match(ostream, match, indent=0, mode=MODE_SUCCESS): child_mode = mode if mode == MODE_SUCCESS: # display only nodes that evaluated successfully. 
- if not match['success']: + if not match["success"]: return # optional statement with no successful children is empty - if (match['node'].get('statement', {}).get('type') == 'optional' - and not any(map(lambda m: m['success'], match['children']))): + if match["node"].get("statement", {}).get("type") == "optional" and not any( + map(lambda m: m["success"], match["children"]) + ): return # not statement, so invert the child mode to show failed evaluations - if match['node'].get('statement', {}).get('type') == 'not': + if match["node"].get("statement", {}).get("type") == "not": child_mode = MODE_FAILURE elif mode == MODE_FAILURE: # display only nodes that did not evaluate to True - if match['success']: + if match["success"]: return # optional statement with successful children is not relevant - if (match['node'].get('statement', {}).get('type') == 'optional' - and any(map(lambda m: m['success'], match['children']))): + if match["node"].get("statement", {}).get("type") == "optional" and any( + map(lambda m: m["success"], match["children"]) + ): return # not statement, so invert the child mode to show successful evaluations - if match['node'].get('statement', {}).get('type') == 'not': + if match["node"].get("statement", {}).get("type") == "not": child_mode = MODE_SUCCESS else: - raise RuntimeError('unexpected mode: ' + mode) + raise RuntimeError("unexpected mode: " + mode) - render_node(ostream, match, match['node'], indent=indent) + render_node(ostream, match, match["node"], indent=indent) - for child in match['children']: + for child in match["children"]: render_match(ostream, child, indent=indent + 1, mode=child_mode) @@ -151,44 +153,44 @@ def render_vverbose(doc): ostream = rutils.StringIO() for rule in rutils.capability_rules(doc): - count = len(rule['matches']) + count = len(rule["matches"]) if count == 1: - capability = rutils.bold(rule['meta']['name']) + capability = rutils.bold(rule["meta"]["name"]) else: - capability = '%s (%d matches)' % 
(rutils.bold(rule['meta']['name']), count) + capability = "%s (%d matches)" % (rutils.bold(rule["meta"]["name"]), count) ostream.writeln(capability) rows = [] for key in capa.rules.META_KEYS: - if key == 'name' or key not in rule['meta']: + if key == "name" or key not in rule["meta"]: continue - v = rule['meta'][key] + v = rule["meta"][key] if isinstance(v, list) and len(v) == 1: v = v[0] elif isinstance(v, list) and len(v) > 1: - v = ', '.join(v) + v = ", ".join(v) rows.append((key, v)) - ostream.writeln(tabulate.tabulate(rows, tablefmt='plain')) + ostream.writeln(tabulate.tabulate(rows, tablefmt="plain")) - if rule['meta']['scope'] == capa.rules.FILE_SCOPE: - matches = list(doc[rule['meta']['name']]['matches'].values()) + if rule["meta"]["scope"] == capa.rules.FILE_SCOPE: + matches = list(doc[rule["meta"]["name"]]["matches"].values()) if len(matches) != 1: # i think there should only ever be one match per file-scope rule, # because we do the file-scope evaluation a single time. # but i'm not 100% sure if this is/will always be true. # so, lets be explicit about our assumptions and raise an exception if they fail. - raise RuntimeError('unexpected file scope match count: ' + len(matches)) + raise RuntimeError("unexpected file scope match count: " + len(matches)) render_match(ostream, matches[0], indent=0) else: - for location, match in sorted(doc[rule['meta']['name']]['matches'].items()): - ostream.write(rule['meta']['scope']) - ostream.write(' @ ') + for location, match in sorted(doc[rule["meta"]["name"]]["matches"].items()): + ostream.write(rule["meta"]["scope"]) + ostream.write(" @ ") ostream.writeln(rutils.hex(location)) render_match(ostream, match, indent=1) - ostream.write('\n') + ostream.write("\n") return ostream.getvalue() diff --git a/capa/rules.py b/capa/rules.py index 8e42d0fc..8b307175 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -22,32 +22,32 @@ logger = logging.getLogger(__name__) # these are the standard metadata fields, in the preferred order. 
# when reformatted, any custom keys will come after these. META_KEYS = ( - 'name', - 'namespace', - 'rule-category', - 'maec/analysis-conclusion', - 'maec/analysis-conclusion-ov', - 'maec/malware-category', - 'maec/malware-category-ov', - 'author', - 'description', - 'lib', - 'scope', - 'att&ck', - 'mbc', - 'references', - 'examples' + "name", + "namespace", + "rule-category", + "maec/analysis-conclusion", + "maec/analysis-conclusion-ov", + "maec/malware-category", + "maec/malware-category-ov", + "author", + "description", + "lib", + "scope", + "att&ck", + "mbc", + "references", + "examples", ) # these are meta fields that are internal to capa, # and added during rule reading/construction. # they may help use manipulate or index rules, # but should not be exposed to clients. -HIDDEN_META_KEYS = ('capa/nursery', 'capa/path') +HIDDEN_META_KEYS = ("capa/nursery", "capa/path") -FILE_SCOPE = 'file' -FUNCTION_SCOPE = 'function' -BASIC_BLOCK_SCOPE = 'basic block' +FILE_SCOPE = "file" +FUNCTION_SCOPE = "function" +BASIC_BLOCK_SCOPE = "basic block" SUPPORTED_FEATURES = { @@ -56,7 +56,7 @@ SUPPORTED_FEATURES = { capa.features.file.Export, capa.features.file.Import, capa.features.file.Section, - capa.features.Characteristic('embedded pe'), + capa.features.Characteristic("embedded pe"), capa.features.String, }, FUNCTION_SCOPE: { @@ -68,18 +68,18 @@ SUPPORTED_FEATURES = { capa.features.insn.Offset, capa.features.insn.Mnemonic, capa.features.basicblock.BasicBlock, - capa.features.Characteristic('switch'), - capa.features.Characteristic('nzxor'), - capa.features.Characteristic('peb access'), - capa.features.Characteristic('fs access'), - capa.features.Characteristic('gs access'), - capa.features.Characteristic('cross section flow'), - capa.features.Characteristic('stack string'), - capa.features.Characteristic('calls from'), - capa.features.Characteristic('calls to'), - capa.features.Characteristic('indirect call'), - capa.features.Characteristic('loop'), - 
capa.features.Characteristic('recursive call') + capa.features.Characteristic("switch"), + capa.features.Characteristic("nzxor"), + capa.features.Characteristic("peb access"), + capa.features.Characteristic("fs access"), + capa.features.Characteristic("gs access"), + capa.features.Characteristic("cross section flow"), + capa.features.Characteristic("stack string"), + capa.features.Characteristic("calls from"), + capa.features.Characteristic("calls to"), + capa.features.Characteristic("indirect call"), + capa.features.Characteristic("loop"), + capa.features.Characteristic("recursive call"), }, BASIC_BLOCK_SCOPE: { capa.features.MatchedRule, @@ -89,14 +89,14 @@ SUPPORTED_FEATURES = { capa.features.Bytes, capa.features.insn.Offset, capa.features.insn.Mnemonic, - capa.features.Characteristic('nzxor'), - capa.features.Characteristic('peb access'), - capa.features.Characteristic('fs access'), - capa.features.Characteristic('gs access'), - capa.features.Characteristic('cross section flow'), - capa.features.Characteristic('tight loop'), - capa.features.Characteristic('stack string'), - capa.features.Characteristic('indirect call') + capa.features.Characteristic("nzxor"), + capa.features.Characteristic("peb access"), + capa.features.Characteristic("fs access"), + capa.features.Characteristic("gs access"), + capa.features.Characteristic("cross section flow"), + capa.features.Characteristic("tight loop"), + capa.features.Characteristic("stack string"), + capa.features.Characteristic("indirect call"), }, } @@ -107,7 +107,7 @@ class InvalidRule(ValueError): self.msg = msg def __str__(self): - return 'invalid rule: %s' % (self.msg) + return "invalid rule: %s" % (self.msg) def __repr__(self): return str(self) @@ -121,7 +121,7 @@ class InvalidRuleWithPath(InvalidRule): self.__cause__ = None def __str__(self): - return 'invalid rule: %s: %s' % (self.path, self.msg) + return "invalid rule: %s: %s" % (self.path, self.msg) class InvalidRuleSet(ValueError): @@ -130,7 +130,7 @@ class 
InvalidRuleSet(ValueError): self.msg = msg def __str__(self): - return 'invalid rule set: %s' % (self.msg) + return "invalid rule set: %s" % (self.msg) def __repr__(self): return str(self) @@ -139,111 +139,112 @@ class InvalidRuleSet(ValueError): def ensure_feature_valid_for_scope(scope, feature): if isinstance(feature, capa.features.Characteristic): if capa.features.Characteristic(feature.name) not in SUPPORTED_FEATURES[scope]: - raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) + raise InvalidRule("feature %s not support for scope %s" % (feature, scope)) elif not isinstance(feature, tuple(filter(lambda t: isinstance(t, type), SUPPORTED_FEATURES[scope]))): - raise InvalidRule('feature %s not support for scope %s' % (feature, scope)) + raise InvalidRule("feature %s not support for scope %s" % (feature, scope)) def parse_int(s): - if s.startswith('0x'): + if s.startswith("0x"): return int(s, 0x10) else: return int(s, 10) def parse_range(s): - ''' + """ parse a string "(0, 1)" into a range (min, max). min and/or max may by None to indicate an unbound range. - ''' + """ # we want to use `{` characters, but this is a dict in yaml. 
- if not s.startswith('('): - raise InvalidRule('invalid range: %s' % (s)) + if not s.startswith("("): + raise InvalidRule("invalid range: %s" % (s)) - if not s.endswith(')'): - raise InvalidRule('invalid range: %s' % (s)) + if not s.endswith(")"): + raise InvalidRule("invalid range: %s" % (s)) - s = s[len('('):-len(')')] - min, _, max = s.partition(',') + s = s[len("(") : -len(")")] + min, _, max = s.partition(",") min = min.strip() max = max.strip() if min: min = parse_int(min.strip()) if min < 0: - raise InvalidRule('range min less than zero') + raise InvalidRule("range min less than zero") else: min = None if max: max = parse_int(max.strip()) if max < 0: - raise InvalidRule('range max less than zero') + raise InvalidRule("range max less than zero") else: max = None if min is not None and max is not None: if max < min: - raise InvalidRule('range max less than min') + raise InvalidRule("range max less than min") return min, max def parse_feature(key): # keep this in sync with supported features - if key == 'api': + if key == "api": return capa.features.insn.API - elif key == 'string': + elif key == "string": return capa.features.String - elif key == 'bytes': + elif key == "bytes": return capa.features.Bytes - elif key == 'number': + elif key == "number": return capa.features.insn.Number - elif key == 'offset': + elif key == "offset": return capa.features.insn.Offset - elif key == 'mnemonic': + elif key == "mnemonic": return capa.features.insn.Mnemonic - elif key == 'basic blocks': + elif key == "basic blocks": return capa.features.basicblock.BasicBlock - elif key.startswith('characteristic(') and key.endswith(')'): - characteristic = key[len('characteristic('):-len(')')] + elif key.startswith("characteristic(") and key.endswith(")"): + characteristic = key[len("characteristic(") : -len(")")] return lambda v: capa.features.Characteristic(characteristic, v) - elif key == 'export': + elif key == "export": return capa.features.file.Export - elif key == 'import': + 
elif key == "import": return capa.features.file.Import - elif key == 'section': + elif key == "section": return capa.features.file.Section - elif key == 'match': + elif key == "match": return capa.features.MatchedRule else: - raise InvalidRule('unexpected statement: %s' % key) + raise InvalidRule("unexpected statement: %s" % key) def parse_symbol(s, value_type): - ''' + """ s can be an int or a string - ''' - if isinstance(s, str) and '=' in s: - value, symbol = s.split('=', 1) + """ + if isinstance(s, str) and "=" in s: + value, symbol = s.split("=", 1) symbol = symbol.strip() - if symbol == '': + if symbol == "": raise InvalidRule('unexpected value: "%s", symbol name cannot be empty' % s) else: value = s symbol = None if isinstance(value, str): - if value_type == 'bytes': + if value_type == "bytes": try: - value = codecs.decode(value.replace(' ', ''), 'hex') + value = codecs.decode(value.replace(" ", ""), "hex") # TODO: Remove TypeError when Python2 is not used anymore except (TypeError, binascii.Error): raise InvalidRule('unexpected bytes value: "%s", must be a valid hex sequence' % value) if len(value) > MAX_BYTES_FEATURE_SIZE: - raise InvalidRule('unexpected bytes value: byte sequences must be no larger than %s bytes' % - MAX_BYTES_FEATURE_SIZE) + raise InvalidRule( + "unexpected bytes value: byte sequences must be no larger than %s bytes" % MAX_BYTES_FEATURE_SIZE + ) else: try: value = parse_int(value) @@ -255,54 +256,54 @@ def parse_symbol(s, value_type): def build_statements(d, scope): if len(d.keys()) != 1: - raise InvalidRule('too many statements') + raise InvalidRule("too many statements") key = list(d.keys())[0] - if key == 'and': + if key == "and": return And(*[build_statements(dd, scope) for dd in d[key]]) - elif key == 'or': + elif key == "or": return Or(*[build_statements(dd, scope) for dd in d[key]]) - elif key == 'not': + elif key == "not": if len(d[key]) != 1: - raise InvalidRule('not statement must have exactly one child statement') + raise 
InvalidRule("not statement must have exactly one child statement") return Not(*[build_statements(dd, scope) for dd in d[key]]) - elif key.endswith(' or more'): - count = int(key[:-len('or more')]) + elif key.endswith(" or more"): + count = int(key[: -len("or more")]) return Some(count, *[build_statements(dd, scope) for dd in d[key]]) - elif key == 'optional': + elif key == "optional": # `optional` is an alias for `0 or more` # which is useful for documenting behaviors, # like with `write file`, we might say that `WriteFile` is optionally found alongside `CreateFileA`. return Some(0, *[build_statements(dd, scope) for dd in d[key]]) - elif key == 'function': + elif key == "function": if scope != FILE_SCOPE: - raise InvalidRule('function subscope supported only for file scope') + raise InvalidRule("function subscope supported only for file scope") if len(d[key]) != 1: - raise InvalidRule('subscope must have exactly one child statement') + raise InvalidRule("subscope must have exactly one child statement") return Subscope(FUNCTION_SCOPE, *[build_statements(dd, FUNCTION_SCOPE) for dd in d[key]]) - elif key == 'basic block': + elif key == "basic block": if scope != FUNCTION_SCOPE: - raise InvalidRule('basic block subscope supported only for function scope') + raise InvalidRule("basic block subscope supported only for function scope") if len(d[key]) != 1: - raise InvalidRule('subscope must have exactly one child statement') + raise InvalidRule("subscope must have exactly one child statement") return Subscope(BASIC_BLOCK_SCOPE, *[build_statements(dd, BASIC_BLOCK_SCOPE) for dd in d[key]]) - elif key.startswith('count(') and key.endswith(')'): + elif key.startswith("count(") and key.endswith(")"): # e.g.: # # count(basic block) # count(mnemonic(mov)) # count(characteristic(nzxor)) - term = key[len('count('):-len(')')] + term = key[len("count(") : -len(")")] - if term.startswith('characteristic('): + if term.startswith("characteristic("): # characteristic features are 
specified a bit specially: # they simply indicate the presence of something unusual/interesting, # and we embed the name in the feature name, like `characteristic(nzxor)`. @@ -320,18 +321,18 @@ def build_statements(d, scope): # - mnemonic: mov # # but here we deal with the form: `mnemonic(mov)`. - term, _, arg = term.partition('(') + term, _, arg = term.partition("(") Feature = parse_feature(term) if arg: - arg = arg[:-len(')')] + arg = arg[: -len(")")] # can't rely on yaml parsing ints embedded within strings # like: # # count(offset(0xC)) # count(number(0x11223344)) # count(number(0x100 = symbol name)) - if term in ('number', 'offset', 'bytes'): + if term in ("number", "offset", "bytes"): value, symbol = parse_symbol(arg, term) feature = Feature(value, symbol) else: @@ -348,29 +349,31 @@ def build_statements(d, scope): count = d[key] if isinstance(count, int): return Range(feature, min=count, max=count) - elif count.endswith(' or more'): - min = parse_int(count[:-len(' or more')]) + elif count.endswith(" or more"): + min = parse_int(count[: -len(" or more")]) max = None return Range(feature, min=min, max=max) - elif count.endswith(' or fewer'): + elif count.endswith(" or fewer"): min = None - max = parse_int(count[:-len(' or fewer')]) + max = parse_int(count[: -len(" or fewer")]) return Range(feature, min=min, max=max) - elif count.startswith('('): + elif count.startswith("("): min, max = parse_range(count) return Range(feature, min=min, max=max) else: - raise InvalidRule('unexpected range: %s' % (count)) - elif key == 'string' and d[key].startswith('/') and (d[key].endswith('/') or d[key].endswith('/i')): + raise InvalidRule("unexpected range: %s" % (count)) + elif key == "string" and d[key].startswith("/") and (d[key].endswith("/") or d[key].endswith("/i")): try: return Regex(d[key]) except re.error: - if d[key].endswith('/i'): - d[key] = d[key][:-len('i')] - raise InvalidRule('invalid regular expression: %s it should use Python syntax, try it at 
https://pythex.org' % d[key]) + if d[key].endswith("/i"): + d[key] = d[key][: -len("i")] + raise InvalidRule( + "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % d[key] + ) else: Feature = parse_feature(key) - if key in ('number', 'offset', 'bytes'): + if key in ("number", "offset", "bytes"): # parse numbers with symbol description, e.g. 0x4550 = IMAGE_DOS_SIGNATURE # or regular numbers, e.g. 37 value, symbol = parse_symbol(d[key], key) @@ -390,7 +393,7 @@ def second(s): # we use the ruamel.yaml parser because it supports roundtripping of documents with comments. -yaml = ruamel.yaml.YAML(typ='rt') +yaml = ruamel.yaml.YAML(typ="rt") # use block mode, not inline json-like mode @@ -410,7 +413,7 @@ yaml.width = 4096 class Rule(object): - def __init__(self, name, scope, statement, meta, definition=''): + def __init__(self, name, scope, statement, meta, definition=""): super(Rule, self).__init__() self.name = name self.scope = scope @@ -419,13 +422,13 @@ class Rule(object): self.definition = definition def __str__(self): - return 'Rule(name=%s)' % (self.name) + return "Rule(name=%s)" % (self.name) def __repr__(self): - return 'Rule(scope=%s, name=%s)' % (self.scope, self.name) + return "Rule(scope=%s, name=%s)" % (self.scope, self.name) def get_dependencies(self, namespaces): - ''' + """ fetch the names of rules this rule relies upon. these are only the direct dependencies; a user must compute the transitive dependency graph themself, if they want it. @@ -436,7 +439,7 @@ class Rule(object): Returns: List[str]: names of rules upon which this rule depends. 
- ''' + """ deps = set([]) def rec(statement): @@ -469,24 +472,31 @@ class Rule(object): def _extract_subscope_rules_rec(self, statement): if isinstance(statement, Statement): # for each child that is a subscope, - for subscope in filter(lambda statement: isinstance(statement, capa.engine.Subscope), statement.get_children()): + for subscope in filter( + lambda statement: isinstance(statement, capa.engine.Subscope), statement.get_children() + ): # create a new rule from it. # the name is a randomly generated, hopefully unique value. # ideally, this won't every be rendered to a user. - name = self.name + '/' + uuid.uuid4().hex - new_rule = Rule(name, subscope.scope, subscope.child, { - 'name': name, - 'scope': subscope.scope, - # these derived rules are never meant to be inspected separately, - # they are dependencies for the parent rule, - # so mark it as such. - 'lib': True, - # metadata that indicates this is derived from a subscope statement - 'capa/subscope-rule': True, - # metadata that links the child rule the parent rule - 'capa/parent': self.name, - }) + name = self.name + "/" + uuid.uuid4().hex + new_rule = Rule( + name, + subscope.scope, + subscope.child, + { + "name": name, + "scope": subscope.scope, + # these derived rules are never meant to be inspected separately, + # they are dependencies for the parent rule, + # so mark it as such. + "lib": True, + # metadata that indicates this is derived from a subscope statement + "capa/subscope-rule": True, + # metadata that links the child rule to the parent rule + "capa/parent": self.name, + }, + ) # update the existing statement to `match` the new rule new_node = capa.features.MatchedRule(name) @@ -503,7 +513,7 @@ class Rule(object): yield new_rule def extract_subscope_rules(self): - ''' + """ scan through the statements of this rule, replacing subscope statements with `match` references to a newly created rule, which are yielded from this routine.
@@ -514,7 +524,7 @@ class Rule(object): for derived_rule in rule.extract_subscope_rules(): assert derived_rule.meta['capa/parent'] == rule.name - ''' + """ # recurse through statements # when encounter Subscope statement @@ -531,27 +541,21 @@ class Rule(object): @classmethod def from_dict(cls, d, s): - name = d['rule']['meta']['name'] + name = d["rule"]["meta"]["name"] # if scope is not specified, default to function scope. # this is probably the mode that rule authors will start with. - scope = d['rule']['meta'].get('scope', FUNCTION_SCOPE) - statements = d['rule']['features'] + scope = d["rule"]["meta"].get("scope", FUNCTION_SCOPE) + statements = d["rule"]["features"] # the rule must start with a single logic node. # doing anything else is too implicit and difficult to remove (AND vs OR ???). if len(statements) != 1: - raise InvalidRule('rule must begin with a single top level statement') + raise InvalidRule("rule must begin with a single top level statement") if isinstance(statements[0], capa.engine.Subscope): - raise InvalidRule('top level statement may not be a subscope') + raise InvalidRule("top level statement may not be a subscope") - return cls( - name, - scope, - build_statements(statements[0], scope), - d['rule']['meta'], - s - ) + return cls(name, scope, build_statements(statements[0], scope), d["rule"]["meta"], s) @classmethod def from_yaml(cls, s): @@ -559,9 +563,9 @@ class Rule(object): @classmethod def from_yaml_file(cls, path): - with open(path, 'rb') as f: + with open(path, "rb") as f: try: - return cls.from_yaml(f.read().decode('utf-8')) + return cls.from_yaml(f.read().decode("utf-8")) except InvalidRule as e: raise InvalidRuleWithPath(path, str(e)) @@ -578,11 +582,11 @@ class Rule(object): definition = yaml.load(self.definition) # definition retains a reference to `meta`, # so we're updating that in place. 
- definition['rule']['meta'] = self.meta + definition["rule"]["meta"] = self.meta meta = self.meta - meta['name'] = self.name - meta['scope'] = self.scope + meta["name"] = self.name + meta["scope"] = self.scope def move_to_end(m, k): # ruamel.yaml uses an ordereddict-like structure to track maps (CommentedMap). @@ -592,8 +596,8 @@ class Rule(object): del m[k] m[k] = v - move_to_end(definition['rule'], 'meta') - move_to_end(definition['rule'], 'features') + move_to_end(definition["rule"], "meta") + move_to_end(definition["rule"], "features") for key in META_KEYS: if key in meta: @@ -624,11 +628,11 @@ class Rule(object): continue meta[key] = value - return ostream.getvalue().decode('utf-8').rstrip('\n') + '\n' + return ostream.getvalue().decode("utf-8").rstrip("\n") + "\n" def get_rules_with_scope(rules, scope): - ''' + """ from the given collection of rules, select those with the given scope. args: @@ -637,12 +641,12 @@ def get_rules_with_scope(rules, scope): returns: List[capa.rules.Rule]: - ''' + """ return list(rule for rule in rules if rule.scope == scope) def get_rules_and_dependencies(rules, rule_name): - ''' + """ from the given collection of rules, select a rule and its dependencies (transitively). args: @@ -651,7 +655,7 @@ def get_rules_and_dependencies(rules, rule_name): yields: Rule: - ''' + """ # we evaluate `rules` multiple times, so if its a generator, realize it into a list. rules = list(rules) namespaces = index_rules_by_namespace(rules) @@ -674,17 +678,17 @@ def ensure_rules_are_unique(rules): seen = set([]) for rule in rules: if rule.name in seen: - raise InvalidRule('duplicate rule name: ' + rule.name) + raise InvalidRule("duplicate rule name: " + rule.name) seen.add(rule.name) def ensure_rule_dependencies_are_met(rules): - ''' + """ raise an exception if a rule dependency does not exist. raises: InvalidRule: if a dependency is not met. - ''' + """ # we evaluate `rules` multiple times, so if its a generator, realize it into a list. 
rules = list(rules) namespaces = index_rules_by_namespace(rules) @@ -696,7 +700,7 @@ def ensure_rule_dependencies_are_met(rules): def index_rules_by_namespace(rules): - ''' + """ compute the rules that fit into each namespace found within the given rules. for example, given: @@ -714,23 +718,23 @@ def index_rules_by_namespace(rules): rules (List[Rule]): Returns: Dict[str, List[Rule]] - ''' + """ namespaces = collections.defaultdict(list) for rule in rules: - namespace = rule.meta.get('namespace') + namespace = rule.meta.get("namespace") if not namespace: continue while namespace: namespaces[namespace].append(rule) - namespace, _, _ = namespace.rpartition('/') + namespace, _, _ = namespace.rpartition("/") return dict(namespaces) class RuleSet(object): - ''' + """ a ruleset is initialized with a collection of rules, which it verifies and sorts into scopes. each set of scoped rules is sorted topologically, which enables rules to match on past rule matches. @@ -742,7 +746,7 @@ class RuleSet(object): ... ]) capa.engine.match(ruleset.file_rules, ...) - ''' + """ def __init__(self, rules): super(RuleSet, self).__init__() @@ -754,7 +758,7 @@ class RuleSet(object): ensure_rule_dependencies_are_met(rules) if len(rules) == 0: - raise InvalidRuleSet('no rules selected') + raise InvalidRuleSet("no rules selected") self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE) self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE) @@ -769,12 +773,12 @@ class RuleSet(object): @staticmethod def _get_rules_for_scope(rules, scope): - ''' + """ given a collection of rules, collect the rules that are needed at the given scope. these rules are ordered topologically. don't include "lib" rules, unless they are dependencies of other rules. - ''' + """ scope_rules = set([]) # we need to process all rules, not just rules with the given scope. @@ -782,7 +786,7 @@ class RuleSet(object): # at lower scope, e.g. function scope. 
# so, we find all dependencies of all rules, and later will filter them down. for rule in rules: - if rule.meta.get('lib', False): + if rule.meta.get("lib", False): continue scope_rules.update(get_rules_and_dependencies(rules, rule.name)) @@ -790,7 +794,7 @@ class RuleSet(object): @staticmethod def _extract_subscope_rules(rules): - ''' + """ process the given sequence of rules. for each one, extract any embedded subscope rules into their own rule. process these recursively. @@ -798,7 +802,7 @@ class RuleSet(object): note: this operation mutates the rules passed in - they may now have `match` statements for the extracted subscope rules. - ''' + """ done = [] # use a queue of rules, because we'll be modifying the list (appending new items) as we go. @@ -811,14 +815,14 @@ class RuleSet(object): return done def filter_rules_by_meta(self, tag): - ''' + """ return new rule set with rules filtered based on all meta field values, adds all dependency rules apply tag-based rule filter assuming that all required rules are loaded can be used to specify selected rules vs. providing a rules child directory where capa cannot resolve dependencies from unknown paths TODO handle circular dependencies? TODO support -t=metafield - ''' + """ rules = self.rules.values() rules_filtered = set([]) for rule in rules: diff --git a/capa/version.py b/capa/version.py index 8d5e01ee..91b491f2 100644 --- a/capa/version.py +++ b/capa/version.py @@ -1,2 +1,2 @@ -__version__ = '0.0.0' -__commit__ = '00000000' +__version__ = "0.0.0" +__commit__ = "00000000"