mirror of
https://github.com/mandiant/capa.git
synced 2025-12-22 07:10:29 -08:00
viv: move FLIRT matching into viv-utils
This commit is contained in:
@@ -10,7 +10,7 @@ import types
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import viv_utils
|
import viv_utils
|
||||||
import vivisect.const
|
import viv_utils.flirt
|
||||||
|
|
||||||
import capa.features.extractors
|
import capa.features.extractors
|
||||||
import capa.features.extractors.viv.file
|
import capa.features.extractors.viv.file
|
||||||
@@ -64,41 +64,6 @@ class VivisectFeatureExtractor(FeatureExtractor):
|
|||||||
self.vw = vw
|
self.vw = vw
|
||||||
self.path = path
|
self.path = path
|
||||||
|
|
||||||
import flirt
|
|
||||||
|
|
||||||
# vc32rtf.sig:
|
|
||||||
# 60,195 total signatures
|
|
||||||
# parsing sig: 0.13s
|
|
||||||
# compiling sigs: 1.18s
|
|
||||||
#
|
|
||||||
# libcmt_15_msvc_x86
|
|
||||||
# 396 total signatures
|
|
||||||
# parsing pat: 0.09s
|
|
||||||
# parsing sigs: 0.01s
|
|
||||||
|
|
||||||
#sigfile = ""
|
|
||||||
sigfile = "vc32rtf.sig"
|
|
||||||
#sigfile = "libcmt_15_msvc_x86.pat"
|
|
||||||
|
|
||||||
if sigfile.endswith(".sig"):
|
|
||||||
with open(sigfile, "rb") as f:
|
|
||||||
with timing("flirt: parsing .sig: " + sigfile):
|
|
||||||
sigs = flirt.parse_sig(f.read())
|
|
||||||
elif sigfile.endswith(".pat"):
|
|
||||||
with open(sigfile, "rb") as f:
|
|
||||||
with timing("flirt: parsing .pat: " + sigfile):
|
|
||||||
sigs = flirt.parse_pat(f.read().decode("utf-8"))
|
|
||||||
else:
|
|
||||||
sigs = []
|
|
||||||
|
|
||||||
logger.debug("flirt: sig count: %d", len(sigs))
|
|
||||||
|
|
||||||
with timing("flirt: compiling sigs"):
|
|
||||||
matcher = flirt.compile(sigs)
|
|
||||||
|
|
||||||
with timing("flirt: matching sigs"):
|
|
||||||
match_vw_flirt_signatures(matcher, vw)
|
|
||||||
|
|
||||||
def get_base_address(self):
|
def get_base_address(self):
|
||||||
# assume there is only one file loaded into the vw
|
# assume there is only one file loaded into the vw
|
||||||
return list(self.vw.filemeta.values())[0]["imagebase"]
|
return list(self.vw.filemeta.values())[0]["imagebase"]
|
||||||
@@ -132,232 +97,7 @@ class VivisectFeatureExtractor(FeatureExtractor):
|
|||||||
yield feature, va
|
yield feature, va
|
||||||
|
|
||||||
def is_library_function(self, va):
|
def is_library_function(self, va):
|
||||||
return is_library_function(self.vw, va)
|
return viv_utils.flirt.is_library_function(self.vw, va)
|
||||||
|
|
||||||
def get_function_name(self, va):
|
def get_function_name(self, va):
|
||||||
return viv_utils.get_function_name(self.vw, va)
|
return viv_utils.get_function_name(self.vw, va)
|
||||||
|
|
||||||
|
|
||||||
# vivisect funcmeta key for a bool to indicate if a function is recognized from a library.
|
|
||||||
# not expecting anyone to use this, aka private symbol.
|
|
||||||
_LIBRARY_META_KEY = "is-library"
|
|
||||||
|
|
||||||
|
|
||||||
def is_library_function(vw, va):
|
|
||||||
"""
|
|
||||||
is the function at the given address a library function?
|
|
||||||
this may be determined by a signature matching backend.
|
|
||||||
if there's no function at the given address, `False` is returned.
|
|
||||||
|
|
||||||
note: if its a library function, it should also have a name set.
|
|
||||||
|
|
||||||
args:
|
|
||||||
vw (vivisect.Workspace):
|
|
||||||
va (int): the virtual address of a function.
|
|
||||||
|
|
||||||
returns:
|
|
||||||
bool: if the function is recognized as from a library.
|
|
||||||
"""
|
|
||||||
return vw.funcmeta.get(va, {}).get(_LIBRARY_META_KEY, False)
|
|
||||||
|
|
||||||
|
|
||||||
def make_library_function(vw, va):
    """
    flag the function at the given address as a library function.
    read the flag back with `is_library_function`.

    when the workspace has no function metadata for the address,
    nothing is recorded and the workspace is left untouched.

    note: a library function should also have a name set;
      assigning that name is the caller's responsibility.

    args:
      vw (vivisect.Workspace): the workspace whose function metadata is updated.
      va (int): the virtual address of a function.
    """
    if va in vw.funcmeta:
        # update the metadata dict stored in the workspace, in place.
        vw.funcmeta[va][_LIBRARY_META_KEY] = True
|
|
||||||
|
|
||||||
|
|
||||||
def add_function_flirt_match(vw, va, name):
    """
    record a FLIRT match for the function at the given address:
    flag it as a library function and give it the matched name.
    any name the function already had is replaced.

    args:
      vw (vivisect.Workspace): the workspace to update.
      va (int): the virtual address of a function.
      name (str): the name to assign to the function.
    """
    # first the library flag...
    make_library_function(vw, va)
    # ...then the name that comes with the match.
    viv_utils.set_function_name(vw, va, name)
|
|
||||||
|
|
||||||
|
|
||||||
def get_match_name(match):
    """
    fetch the best name for a `flirt.FlirtSignature` instance.
    these instances returned by `flirt.FlirtMatcher.match()`
    may have multiple names, such as public and local names for different parts
    of a function. the best name is that at offset zero (the function name).

    probably every signature has a best name, though I'm not 100% sure.

    args:
      match (flirt.FlirtSignature): the signature to get a name from.
        its `names` attribute is iterated as (name, type, offset) tuples.

    returns:
      str: the best name of the function matched by the given signature.

    raises:
      ValueError: if none of the signature's names is at offset zero.
    """
    for (name, _type, offset) in match.names:
        if offset == 0:
            return name

    # exceptions do not interpolate logging-style arguments,
    # so format the message before constructing the ValueError.
    raise ValueError("flirt: match: no best name: %s" % (match.names,))
|
|
||||||
|
|
||||||
|
|
||||||
def match_function_flirt_signatures(matcher, vw, va):
    """
    match the given FLIRT signatures against the function at the given address.
    upon success, update the workspace with match metadata, setting the
    function as a library function and assigning its name.

    if multiple different signatures match the function, don't do anything.

    signatures that carry "reference" names are only accepted when every
    referenced location has a code xref to a function that itself resolves
    (possibly via a recursive FLIRT match) to the expected name.

    args:
      matcher (flirt.FlirtMatcher): the compiled FLIRT signature matcher.
      vw (vivisect.workspace): the analyzed program's workspace.
      va (int): the virtual address of a function to match.

    returns:
      Optional[str]: the recognized function name, or `None`.
    """
    function_meta = vw.funcmeta.get(va)
    if not function_meta:
        # not a function, we're not going to consider this.
        return None

    if is_library_function(vw, va):
        # already matched here.
        # this might be the case if recursive matching visited this address.
        return viv_utils.get_function_name(vw, va)

    # 0x200 comes from:
    #  0x20 bytes for default byte signature size in flirt
    #  0x100 bytes for max checksum data size
    #  some wiggle room for tail bytes
    size = function_meta.get("Size", 0x200)
    # TODO: fix reads at the end of a section.
    buf = vw.readMemory(va, size)

    matches = []
    for match in matcher.match(buf):
        # collect all the name tuples (name, type, offset) with type==reference.
        # ignores other name types like "public" and "local".
        references = [n for n in match.names if n[1] == "reference"]

        if not references:
            # there are no references that we need to check, so this is a complete match.
            # common case.
            matches.append(match)
            continue

        # flirt uses reference names to assert that
        # the function contains a reference to another function with a given name.
        #
        # we need to loop through these references,
        # potentially recursively FLIRT match,
        # and check the name matches (or doesn't).

        # at the end of the following loop,
        # if this flag is still true,
        # then all the references have been validated.
        does_match_references = True

        # several references of this signature may point at the same address,
        # so this caches the names looked up in the below loop.
        # a cached value of `None` means "no name was recognized at that address".
        # type: Dict[int, Optional[str]]
        local_names = {}
        for (ref_name, _, ref_offset) in references:
            ref_va = va + ref_offset

            # the reference offset may be inside an instruction,
            # so we use getLocation to select the containing instruction address.
            location = vw.getLocation(ref_va)
            if location is None:
                # nothing is defined at the referenced address,
                # so there is no xref to inspect and the reference cannot be validated.
                does_match_references = False
                break
            loc_va = location[vivisect.const.L_VA]

            # an instruction may have multiple xrefs from
            # so we loop through all code references,
            # searching for that name.
            #
            # TODO: if we assume there is a single code reference, this is a lot easier.
            # can we do that with FLIRT?
            #
            # if the name is found, then this flag will be set.
            does_match_the_reference = False
            for xref in vw.getXrefsFrom(loc_va):
                # FLIRT signatures only match code,
                # so we're only going to resolve references that point to code.
                if xref[vivisect.const.XR_RTYPE] != vivisect.const.REF_CODE:
                    continue

                target = xref[vivisect.const.XR_TO]
                if target in local_names:
                    # fast path: a prior loop already looked up this address.
                    found_name = local_names[target]
                else:
                    # this is a bit slower, since we have to read buf, do match, etc.
                    # note that we don't ever save "this is not a library function",
                    # so there's not an easy way to short circuit at the start of this function.
                    found_name = match_function_flirt_signatures(matcher, vw, target)
                    local_names[target] = found_name

                if found_name == ref_name:
                    does_match_the_reference = True
                    break

            if not does_match_the_reference:
                does_match_references = False
                break

        if does_match_references:
            # only if all references pass do we count it.
            matches.append(match)

    if matches:
        # we may have multiple signatures that match the same function, like `strcpy`.
        # these could be copies from multiple libraries.
        # so we don't mind if there are multiple matches, as long as names are the same.
        #
        # but if there are multiple candidate names, that's a problem.
        # our signatures are not precise enough.
        # we could maybe mark the function as "is a library function", but not assign name.
        # though, if we have signature FPs among library functions, it could easily FP with user code too.
        # so safest thing to do is not make any claim about the function.
        names = list(set(map(get_match_name, matches)))
        if len(names) == 1:
            name = names[0]
            add_function_flirt_match(vw, va, name)
            logger.debug("flirt: found library function: 0x%x: %s", va, name)
            return name
        else:
            logger.warning("flirt: conflicting names: 0x%x: %s", va, names)
            return None

    # no signature matched the function.
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def match_vw_flirt_signatures(matcher, vw):
    """
    run FLIRT matching over every function in the workspace, in ascending address order.
    each function that matches is recorded in the workspace as a library
    function and is assigned the matched name.

    a function matched by signatures with differing names is left untouched.

    args:
      matcher (flirt.FlirtMatcher): the compiled FLIRT signature matcher.
      vw (vivisect.workspace): the analyzed program's workspace.
    """
    function_vas = list(vw.getFunctions())
    function_vas.sort()
    for function_va in function_vas:
        match_function_flirt_signatures(matcher, vw, function_va)
|
|
||||||
@@ -7,6 +7,7 @@
|
|||||||
# See the License for the specific language governing permissions and limitations under the License.
|
# See the License for the specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
import viv_utils
|
import viv_utils
|
||||||
|
import viv_utils.flirt
|
||||||
import envi.memory
|
import envi.memory
|
||||||
import envi.archs.i386.disasm
|
import envi.archs.i386.disasm
|
||||||
|
|
||||||
@@ -112,7 +113,7 @@ def extract_insn_api_features(f, bb, insn):
|
|||||||
if not target:
|
if not target:
|
||||||
return
|
return
|
||||||
|
|
||||||
if capa.features.extractors.viv.is_library_function(f.vw, target):
|
if viv_utils.flirt.is_library_function(f.vw, target):
|
||||||
name = viv_utils.get_function_name(f.vw, target)
|
name = viv_utils.get_function_name(f.vw, target)
|
||||||
yield API(name), insn.va
|
yield API(name), insn.va
|
||||||
return
|
return
|
||||||
|
|||||||
1
setup.py
1
setup.py
@@ -30,6 +30,7 @@ if sys.version_info >= (3, 0):
|
|||||||
requirements.append("vivisect")
|
requirements.append("vivisect")
|
||||||
requirements.append("viv-utils==0.3.19")
|
requirements.append("viv-utils==0.3.19")
|
||||||
requirements.append("smda==1.5.13")
|
requirements.append("smda==1.5.13")
|
||||||
|
requirements.append("python-flirt~=0.5.4")
|
||||||
else:
|
else:
|
||||||
# py2
|
# py2
|
||||||
requirements.append("enum34==1.1.6") # v1.1.6 is needed by halo 0.0.30 / spinners 0.0.24
|
requirements.append("enum34==1.1.6") # v1.1.6 is needed by halo 0.0.30 / spinners 0.0.24
|
||||||
|
|||||||
Reference in New Issue
Block a user