From 358aab85e7da4e502497e73542392cf311c3393c Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 4 Mar 2021 14:51:40 -0700 Subject: [PATCH] viv: move FLIRT matching into viv-utils --- capa/features/extractors/viv/__init__.py | 266 +---------------------- capa/features/extractors/viv/insn.py | 3 +- setup.py | 1 + 3 files changed, 6 insertions(+), 264 deletions(-) diff --git a/capa/features/extractors/viv/__init__.py b/capa/features/extractors/viv/__init__.py index 071495a1..ef0d59ed 100644 --- a/capa/features/extractors/viv/__init__.py +++ b/capa/features/extractors/viv/__init__.py @@ -10,7 +10,7 @@ import types import logging import viv_utils -import vivisect.const +import viv_utils.flirt import capa.features.extractors import capa.features.extractors.viv.file @@ -64,41 +64,6 @@ class VivisectFeatureExtractor(FeatureExtractor): self.vw = vw self.path = path - import flirt - - # vc32rtf.sig: - # 60,195 total signatures - # parsing sig: 0.13s - # compiling sigs: 1.18s - # - # libcmt_15_msvc_x86 - # 396 total signatures - # parsing pat: 0.09s - # parsing sigs: 0.01s - - #sigfile = "" - sigfile = "vc32rtf.sig" - #sigfile = "libcmt_15_msvc_x86.pat" - - if sigfile.endswith(".sig"): - with open(sigfile, "rb") as f: - with timing("flirt: parsing .sig: " + sigfile): - sigs = flirt.parse_sig(f.read()) - elif sigfile.endswith(".pat"): - with open(sigfile, "rb") as f: - with timing("flirt: parsing .pat: " + sigfile): - sigs = flirt.parse_pat(f.read().decode("utf-8")) - else: - sigs = [] - - logger.debug("flirt: sig count: %d", len(sigs)) - - with timing("flirt: compiling sigs"): - matcher = flirt.compile(sigs) - - with timing("flirt: matching sigs"): - match_vw_flirt_signatures(matcher, vw) - def get_base_address(self): # assume there is only one file loaded into the vw return list(self.vw.filemeta.values())[0]["imagebase"] @@ -132,232 +97,7 @@ class VivisectFeatureExtractor(FeatureExtractor): yield feature, va def is_library_function(self, va): - return is_library_function(self.vw, va) + return viv_utils.flirt.is_library_function(self.vw, va) def get_function_name(self, va): - return viv_utils.get_function_name(self.vw, va) - - -# vivisect funcmeta key for a bool to indicate if a function is recognized from a library. -# not expecting anyone to use this, aka private symbol. -_LIBRARY_META_KEY = "is-library" - - -def is_library_function(vw, va): - """ - is the function at the given address a library function? - this may be determined by a signature matching backend. - if there's no function at the given address, `False` is returned. - - note: if its a library function, it should also have a name set. - - args: - vw (vivisect.Workspace): - va (int): the virtual address of a function. - - returns: - bool: if the function is recognized as from a library. - """ - return vw.funcmeta.get(va, {}).get(_LIBRARY_META_KEY, False) - - -def make_library_function(vw, va): - """ - mark the function with the given address a library function. - the associated accessor is `is_library_function`. - - if there's no function at the given address, this routine has no effect. - - note: if its a library function, it should also have a name set. - its up to the caller to do this part. - - args: - vw (vivisect.Workspace): - va (int): the virtual address of a function. - """ - fmeta = vw.funcmeta.get(va, {}) - fmeta[_LIBRARY_META_KEY] = True - - -def add_function_flirt_match(vw, va, name): - """ - mark the function at the given address as a library function with the given name. - the name overrides any existing function name. - - args: - vw (vivisect.Workspace): - va (int): the virtual address of a function. - name (str): the name to assign to the function. - """ - make_library_function(vw, va) - viv_utils.set_function_name(vw, va, name) - - -def get_match_name(match): - """ - fetch the best name for a `flirt.FlirtSignature` instance. - these instances returned by `flirt.FlirtMatcher.match()` - may have multiple names, such as public and local names for different parts - of a function. the best name is that at offset zero (the function name). - - probably every signature has a best name, though I'm not 100% sure. - - args: - match (flirt.FlirtSignature): the signature to get a name from. - - returns: - str: the best name of the function matched by the given signature. - """ - for (name, type_, offset) in match.names: - if offset == 0: - return name - raise ValueError("flirt: match: no best name: %s", match.names) - - -def match_function_flirt_signatures(matcher, vw, va): - """ - match the given FLIRT signatures against the function at the given address. - upon success, update the workspace with match metadata, setting the - function as a library function and assigning its name. - - if multiple different signatures match the function, don't do anything. - - args: - match (flirt.FlirtMatcher): the compiled FLIRT signature matcher. - vw (vivisect.workspace): the analyzed program's workspace. - va (int): the virtual address of a function to match. - - returns: - Optional[str]: the recognized function name, or `None`. - """ - function_meta = vw.funcmeta.get(va) - if not function_meta: - # not a function, we're not going to consider this. - return None - - if is_library_function(vw, va): - # already matched here. - # this might be the case if recursive matching visited this address. - return viv_utils.get_function_name(vw, va) - - # 0x200 comes from: - # 0x20 bytes for default byte signature size in flirt - # 0x100 bytes for max checksum data size - # some wiggle room for tail bytes - size = function_meta.get("Size", 0x200) - # TODO: fix reads at the end of a section. - buf = vw.readMemory(va, size) - - matches = [] - for match in matcher.match(buf): - # collect all the name tuples (name, type, offset) with type==reference. - # ignores other name types like "public" and "local". - references = list(filter(lambda n: n[1] == "reference", match.names)) - - if not references: - # there are no references that we need to check, so this is a complete match. - # common case. - matches.append(match) - - else: - # flirt uses reference names to assert that - # the function contains a reference to another function with a given name. - # - # we need to loop through these references, - # potentially recursively FLIRT match, - # and check the name matches (or doesn't). - - # at the end of the following loop, - # if this flag is still true, - # then all the references have been validated. - does_match_references = True - - #logger.debug("flirt: references needed for name %s for function at 0x%x: %s", get_match_name(match), va, references) - - # when a reference is used to differentiate rule matches, - # then we can expect multiple rules to query the name of the same address. - # so, this caches the names looked up in the below loop. - # type: Map[int, str] - local_names = {} - for (ref_name, _, ref_offset) in references: - ref_va = va + ref_offset - - # the reference offset may be inside an instruction, - # so we use getLocation to select the containing instruction address. - loc_va = vw.getLocation(ref_va)[vivisect.const.L_VA] - - # an instruction may have multiple xrefs from - # so we loop through all code references, - # searching for that name. - # - # TODO: if we assume there is a single code reference, this is a lot easier. - # can we do that with FLIRT? - # - # if the name is found, then this flag will be set. - does_match_the_reference = False - for xref in vw.getXrefsFrom(loc_va): - # FLIRT signatures only match code, - # so we're only going to resolve references that point to code. - if xref[vivisect.const.XR_RTYPE] != vivisect.const.REF_CODE: - continue - - target = xref[vivisect.const.XR_TO] - if target in local_names: - # fast path: a prior loop already looked up this address. - found_name = local_names[target] - else: - # this is a bit slower, since we have to read buf, do match, etc. - # note that we don't ever save "this is not a library function", - # so there's not an easy way to short circuit at the start of this function. - found_name = match_function_flirt_signatures(matcher, vw, target) - local_names[target] = found_name - - #logger.debug("flirt: reference: 0x%x: 0x%x: wanted: %s found: %s", loc_va, target, ref_name, found_name) - - if found_name == ref_name: - does_match_the_reference = True - break - - if not does_match_the_reference: - does_match_references = False - break - - if does_match_references: - # only if all references pass do we count it. - matches.append(match) - - if matches: - # we may have multiple signatures that match the same function, like `strcpy`. - # these could be copies from multiple libraries. - # so we don't mind if there are multiple matches, as long as names are the same. - # - # but if there are multiple candidate names, that's a problem. - # our signatures are not precise enough. - # we could maybe mark the function as "is a library function", but not assign name. - # though, if we have signature FPs among library functions, it could easily FP with user code too. - # so safest thing to do is not make any claim about the function. - names = list(set(map(get_match_name, matches))) - if len(names) == 1: - name = names[0] - add_function_flirt_match(vw, va, name) - logger.debug("flirt: found library function: 0x%x: %s", va, name) - return name - else: - logger.warning("flirt: conflicting names: 0x%x: %s", va, names) - return None - - -def match_vw_flirt_signatures(matcher, vw): - """ - enumerate all functions in the workspace and match the given FLIRT signatures. - upon each success, update the workspace with match metadata, setting the - function as a library function and assigning its name. - - if multiple different signatures match a function, don't do anything. - - args: - match (flirt.FlirtMatcher): the compiled FLIRT signature matcher. - vw (vivisect.workspace): the analyzed program's workspace. - """ - for va in sorted(vw.getFunctions()): - match_function_flirt_signatures(matcher, vw, va) \ No newline at end of file + return viv_utils.get_function_name(self.vw, va) \ No newline at end of file diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index a797b413..e528f748 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -7,6 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. import viv_utils +import viv_utils.flirt import envi.memory import envi.archs.i386.disasm @@ -112,7 +113,7 @@ def extract_insn_api_features(f, bb, insn): if not target: return - if capa.features.extractors.viv.is_library_function(f.vw, target): + if viv_utils.flirt.is_library_function(f.vw, target): name = viv_utils.get_function_name(f.vw, target) yield API(name), insn.va return diff --git a/setup.py b/setup.py index 4d0c63ac..35b16170 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ if sys.version_info >= (3, 0): requirements.append("vivisect") requirements.append("viv-utils==0.3.19") requirements.append("smda==1.5.13") + requirements.append("python-flirt~=0.5.4") else: # py2 requirements.append("enum34==1.1.6") # v1.1.6 is needed by halo 0.0.30 / spinners 0.0.24