capa/capa/features/extractors/ida/file.py

# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.

import struct

import idc
import idaapi
import idautils

import capa.features.extractors.helpers
import capa.features.extractors.strings
import capa.features.extractors.ida.helpers
from capa.features import String, Characteristic
from capa.features.file import Export, Import, Section


def check_segment_for_pe(seg):
    """check segment for embedded PE, including single-byte XOR-encoded ones

    adapted for IDA from:
    https://github.com/vivisect/vivisect/blob/7be4037b1cecc4551b397f840405a1fc606f9b53/PE/carve.py#L19

    for each of the 256 single-byte keys, the segment is searched for the
    xor_static()-encoded "MZ" magic. a candidate is reported when the decoded
    32-bit little-endian e_lfanew field at candidate + 0x3C points, inside the
    segment, at the equally encoded "PE" magic.

    args:
        seg (IDA segment_t)

    yields:
        (int, int): address of the candidate MZ header and the XOR key (0-255) it was found with
    """
    find_byte_sequence = capa.features.extractors.ida.helpers.find_byte_sequence
    xor_static = capa.features.extractors.helpers.xor_static

    seg_max = seg.end_ea

    # seed the worklist with the first encoded "MZ" match for each key that occurs in the segment
    todo = []
    for i in range(256):
        mzx = xor_static(b"MZ", i)
        off = find_byte_sequence(seg.start_ea, seg_max, mzx)
        if off != idaapi.BADADDR:
            todo.append((off, mzx, xor_static(b"PE", i), i))

    while todo:
        off, mzx, pex, i = todo.pop()

        # the only MZ header field we check is e_lfanew, located at offset 0x3C
        e_lfanew = off + 0x3C

        if seg_max < (e_lfanew + 4):
            # the field would extend past the segment end; presumably any later match
            # for this key (found searching from off + 1) would as well, so stop here
            continue

        # queue the next candidate for this key *before* validating the current one,
        # as the vivisect original does, so that a false "MZ" hit whose e_lfanew points
        # outside the segment does not end the scan for this key.
        # the helper signals "not found" with idaapi.BADADDR (same check as the seeding above), not -1
        nextres = find_byte_sequence(off + 1, seg_max, mzx)
        if nextres != idaapi.BADADDR:
            todo.append((nextres, mzx, pex, i))

        newoff = struct.unpack("<I", xor_static(idc.get_bytes(e_lfanew, 4), i))[0]

        peoff = off + newoff
        if seg_max < (peoff + 2):
            continue

        if idc.get_bytes(peoff, 2) == pex:
            yield (off, i)


def extract_file_embedded_pe():
    """yield an "embedded pe" characteristic for every PE candidate found in the scanned segments

    segments are taken from get_segments(skip_header_segments=True) and each
    one is scanned with check_segment_for_pe().

    IDA must load resource sections for this to be complete
        - '-R' from console
        - Check 'Load resource sections' when opening binary in IDA manually
    """
    segments = capa.features.extractors.ida.helpers.get_segments(skip_header_segments=True)
    for segment in segments:
        for hit in check_segment_for_pe(segment):
            # each hit is an (address, xor key) pair; only the address is reported
            yield Characteristic("embedded pe"), hit[0]


def extract_file_export_names():
    """extract function exports

    yields an Export feature, paired with its address, for each entry
    reported by idautils.Entries().
    """
    for entry in idautils.Entries():
        # entries unpack into four fields; only the last two (address, name) are used
        _, _, address, export_name = entry
        yield Export(export_name), address


def extract_file_import_names():
    """extract function imports

    each import record is indexed as (module name, import name, ordinal).

    - a record with an import name yields two features, so that rules can
      match on the import name alone:
        modulename.importname
        importname
    - a record with an ordinal yields:
        modulename.#ordinal

    a record carrying both a name and an ordinal yields all three.
    """
    imports = capa.features.extractors.ida.helpers.get_file_imports()
    for address, info in imports.items():
        symbol = info[1]
        if symbol:
            qualified = "%s.%s" % (info[0], symbol)
            yield Import(qualified), address
            yield Import(symbol), address

        ordinal = info[2]
        if ordinal:
            by_ordinal = "%s.#%s" % (info[0], str(ordinal))
            yield Import(by_ordinal), address


def extract_file_section_names():
    """extract section names

    yields a Section feature named after each segment returned by
    get_segments(skip_header_segments=True), paired with the segment's start address.

    IDA must load resource sections for this to be complete
        - '-R' from console
        - Check 'Load resource sections' when opening binary in IDA manually
    """
    segments = capa.features.extractors.ida.helpers.get_segments(skip_header_segments=True)
    for segment in segments:
        segment_name = idaapi.get_segm_name(segment)
        yield Section(segment_name), segment.start_ea


def extract_file_strings():
    """extract ASCII and UTF-16 LE strings

    every segment's buffer is scanned twice: all ASCII strings are yielded
    first, then all unicode strings. each String feature is paired with the
    string's address (segment start + offset within the buffer).

    IDA must load resource sections for this to be complete
        - '-R' from console
        - Check 'Load resource sections' when opening binary in IDA manually
    """
    # applied in this order to every segment buffer
    string_extractors = (
        capa.features.extractors.strings.extract_ascii_strings,
        capa.features.extractors.strings.extract_unicode_strings,
    )

    for segment in capa.features.extractors.ida.helpers.get_segments():
        data = capa.features.extractors.ida.helpers.get_segment_buffer(segment)
        base = segment.start_ea

        for extract in string_extractors:
            for found in extract(data):
                yield String(found.s), (base + found.offset)


def extract_features():
    """extract file features

    calls every extractor listed in FILE_HANDLERS, in order, and yields each
    (feature, va) pair it produces.
    """
    for extract in FILE_HANDLERS:
        for result in extract():
            feature, va = result
            yield feature, va


# file-level feature extractors run by extract_features(), in this order.
# each entry is called with no arguments and iterated for (feature, va) pairs,
# so the order here determines the order in which features are emitted.
FILE_HANDLERS = (
    extract_file_export_names,
    extract_file_import_names,
    extract_file_strings,
    extract_file_section_names,
    extract_file_embedded_pe,
)


def main():
    """collect every file feature and pretty-print the resulting list"""
    from pprint import pprint

    features = list(extract_features())
    pprint(features)


# extract and print the features only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()