From 79d94144c67a181ea1b655be4ec1216744e352ce Mon Sep 17 00:00:00 2001
From: Michael Hunhoff
Date: Mon, 10 Aug 2020 15:01:16 -0600
Subject: [PATCH] adding IDA extractor code to resolve nested data references for string and bytes features

---
 capa/features/extractors/ida/helpers.py | 40 ++++++++++++++++++++++++++++++++++++++++
 capa/features/extractors/ida/insn.py    | 10 ++++------
 tests/test_ida_features.py              | 14 ++++++++++++++
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/capa/features/extractors/ida/helpers.py b/capa/features/extractors/ida/helpers.py
index eca84fd5..0b89ed4e 100644
--- a/capa/features/extractors/ida/helpers.py
+++ b/capa/features/extractors/ida/helpers.py
@@ -331,3 +331,43 @@ def is_basic_block_tight_loop(bb):
             if ref == bb.start_ea:
                 return True
     return False
+
+
+def find_data_reference_from_insn_helper(ea, max_depth=10):
+    """ follow a chain of single data references starting at ea, resolving nested pointers
+
+    args:
+        ea: address at which to start the search
+        max_depth: maximum number of references to follow (default 10)
+
+    returns:
+        address at the end of the chain, or ea unchanged if there is nothing to follow
+    """
+    if max_depth <= 0:
+        # return when max depth reached (<= also stops on a negative budget instead of recursing unbounded)
+        return ea
+
+    data_refs = list(idautils.DataRefsFrom(ea))
+
+    if len(data_refs) != 1:
+        # return if no refs or more than one ref (assume nested pointers only have one data reference)
+        return ea
+
+    if ea == data_refs[0]:
+        # return if circular reference
+        return ea
+
+    # continue searching
+    return find_data_reference_from_insn_helper(data_refs[0], max_depth - 1)
+
+
+def find_data_reference_from_insn(insn):
+    """ resolve the data referenced by an instruction, following nested pointers
+
+    args:
+        insn: instruction to inspect; only its ea attribute is read here
+
+    returns:
+        address of the resolved data reference, if exists, otherwise address of instruction
+    """
+    return find_data_reference_from_insn_helper(insn.ea)
diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py
index 225f2613..0a5276d8 100644
--- a/capa/features/extractors/ida/insn.py
+++ b/capa/features/extractors/ida/insn.py
@@ -116,11 +116,8 @@ def extract_insn_bytes_features(f, bb, insn):
     example:
         push offset iid_004118d4_IShellLinkA ; riid
     """
-    if idaapi.is_call_insn(insn):
-        # ignore call instructions
-        return
-
-    for ref in idautils.DataRefsFrom(insn.ea):
+    ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn)
+    if ref != insn.ea:
         extracted_bytes = capa.features.extractors.ida.helpers.read_bytes_at(ref, MAX_BYTES_FEATURE_SIZE)
         if extracted_bytes and not capa.features.extractors.helpers.all_zeros(extracted_bytes):
             yield Bytes(extracted_bytes), insn.ea
@@ -137,7 +134,8 @@ def extract_insn_string_features(f, bb, insn):
     example:
         push offset aAcr ; "ACR > "
     """
-    for ref in idautils.DataRefsFrom(insn.ea):
+    ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn)
+    if ref != insn.ea:
         found = capa.features.extractors.ida.helpers.find_string_at(ref)
         if found:
             yield String(found), insn.ea
diff --git a/tests/test_ida_features.py b/tests/test_ida_features.py
index e7afc738..c1e3f163 100644
--- a/tests/test_ida_features.py
+++ b/tests/test_ida_features.py
@@ -100,6 +100,13 @@ def test_string_features():
     assert capa.features.String("bcrypt.dll") not in features
 
 
+@pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
+def test_string_pointer_features():
+    f = get_extractor().get_function(0x0044EDEF)
+    features = extract_function_features(f)
+    assert capa.features.String("INPUTEVENT") in features
+
+
 @pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
 def test_byte_features():
     f = get_extractor().get_function(0x40105D)
@@ -109,6 +116,13 @@ def test_byte_features():
     assert wanted.evaluate(features) == True
 
 
+@pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
+def test_bytes_pointer_features():
+    f = get_extractor().get_function(0x0044EDEF)
+    features = extract_function_features(f)
+    assert capa.features.Bytes("INPUTEVENT".encode("utf-16le")).evaluate(features) == True
+
+
 @pytest.mark.skip(reason="IDA Pro tests must be run within IDA")
 def test_number_features():
     f = get_extractor().get_function(0x40105D)