From ec21f3b3fc2802fa067c48813e764b56f19c596a Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 20 Jun 2024 10:08:27 -0600 Subject: [PATCH] vmray: use xmltodict instead of pydantic_xml to improve performance --- capa/features/extractors/vmray/__init__.py | 4 +- capa/features/extractors/vmray/call.py | 1 - capa/features/extractors/vmray/extractor.py | 7 +- capa/features/extractors/vmray/models.py | 106 +++++--------------- requirements.txt | 2 +- 5 files changed, 32 insertions(+), 88 deletions(-) diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index a0528278..3d2bf9d0 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -8,11 +8,11 @@ from typing import Dict from capa.exceptions import UnsupportedFormatError -from capa.features.extractors.vmray.models import File, Analysis, SummaryV2, StaticData +from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData class VMRayAnalysis: - def __init__(self, sv2: SummaryV2, flog: Analysis): + def __init__(self, sv2: SummaryV2, flog: Flog): self.sv2 = sv2 # logs/summary_v2.json self.flog = flog # logs/flog.xml self.exports: Dict[int, str] = {} diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py index e298c4ad..c4d117d2 100644 --- a/capa/features/extractors/vmray/call.py +++ b/capa/features/extractors/vmray/call.py @@ -5,7 +5,6 @@ from capa.helpers import assert_never from capa.features.insn import API, Number from capa.features.common import String, Feature from capa.features.address import Address -from capa.features.extractors.vmray.models import Analysis from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle logger = logging.getLogger(__name__) diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py index 19eb3b61..09450200 100644 --- a/capa/features/extractors/vmray/extractor.py +++ b/capa/features/extractors/vmray/extractor.py @@ -11,13 +11,15 @@ from typing import Tuple, Iterator from pathlib import Path from zipfile import ZipFile +import xmltodict + import capa.helpers import capa.features.extractors.vmray.file import capa.features.extractors.vmray.global_ from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.vmray import VMRayAnalysis -from capa.features.extractors.vmray.models import Process, Analysis, SummaryV2 +from capa.features.extractors.vmray.models import Flog, Process, SummaryV2 from capa.features.extractors.base_extractor import ( CallHandle, SampleHashes, @@ -95,6 +97,7 @@ class VMRayExtractor(DynamicFeatureExtractor): sv2 = SummaryV2.model_validate(sv2_json) flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected") - flog = Analysis.from_xml(flog_xml) + flog_json = xmltodict.parse(flog_xml, attr_prefix="") + flog = Flog.model_validate(flog_json) return cls(VMRayAnalysis(sv2, flog)) diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py index dc668151..5e0ddd48 100644 --- a/capa/features/extractors/vmray/models.py +++ b/capa/features/extractors/vmray/models.py @@ -8,96 +8,38 @@ from typing import Dict, List, Optional -from pydantic import BaseModel -from pydantic_xml import BaseXmlModel, attr, element +from pydantic import Field, BaseModel -### models for flog.xml -class Param(BaseXmlModel, tag="param"): - name: str = attr() - type: str = attr() - value: Optional[str] = attr(default=None) +### models flog.xml files +class FunctionCall(BaseModel): + ts: str + fncall_id: str + process_id: str + thread_id: str + name: str + addr: str + from_addr: str = Field(alias="from") -# or see https://pydantic-xml.readthedocs.io/en/latest/pages/quickstart.html#wrapper -class In(BaseXmlModel, tag="in"): - params: List[Param] = element(name="in") +class FunctionReturn(BaseModel): + ts: str + fncall_id: str + addr: str + from_addr: str = Field(alias="from") -class Out(BaseXmlModel, tag="out"): - params: List[Param] = element(name="out") +class Analysis(BaseModel): + log_version: str + analyzer_version: str + analysis_date: str + + function_calls: List[FunctionCall] = Field(alias="fncall", default=[]) + function_returns: List[FunctionReturn] = Field(alias="fnret", default=[]) -class FunctionCall(BaseXmlModel, tag="fncall"): - ts: int = attr() - fncall_id: int = attr() - process_id: int = attr() - thread_id: int = attr() - name: str = attr() # API call name? - address: str = attr(name="addr") - from_: str = attr(name="from") - in_: Optional[In] = element(tag="in", default=None) - out_: Optional[Out] = element(tag="out", default=None) - - -# note that not all fncalls always have an associated fnret, e.g. exit or WaitForSingleObject -class FunctionReturn(BaseXmlModel, tag="fnret"): - ts: int = attr() - fncall_id: int = attr() - address: str = attr(name="addr") # string that contains a hex value - from_: str = attr(name="from") # string that contains a hex value - - -# TODO check multiple are there -class MonitorProcess(BaseXmlModel, tag="monitor_process"): - ts: int = attr() - process_id: int = attr() - image_name: str = attr() - - -# TODO check multiple are there -class MonitorThread(BaseXmlModel, tag="monitor_thread"): - ts: int = attr() - thread_id: int = attr() - process_id: int = attr() - os_tid: str = attr() # TODO hex - - -class NewRegion(BaseXmlModel, tag="new_region"): - ts: int = attr() - region_id: int = attr() - process_id: int = attr() - start_va: str = attr() - end_va: str = attr() - entry_point: str = attr() - - -class RemoveRegion(BaseXmlModel, tag="remove_region"): - ts: int = attr() - region_id: int = attr() - - -# unordered is very slow, but elements may occur in any order -class Analysis(BaseXmlModel, tag="analysis", search_mode="unordered"): - log_version: str = attr() - analyzer_version: str = attr() - analysis_date: str = attr() - - # super slow - # data: List[Union[MonitorProcess, MonitorThread, NewRegion, RemoveRegion, FunctionCall, FunctionReturn]] - - # may want to preprocess file and remove/reorder entries for more efficient parsing - - processes: List[MonitorProcess] = element(tag="monitor_process") - threads: List[MonitorThread] = element(tag="monitor_thread") - - # not important and slow down parsing - # new_regions: List[NewRegion] = element(tag="new_region") - # remove_regions: List[RemoveRegion] = element(tag="remove_region") - - # very slow alternative; calls: List[Union[FunctionCall, FunctionReturn]] - fncalls: List[FunctionCall] = element(tag="fncall") - fnrets: List[FunctionReturn] = element(tag="fnret") +class Flog(BaseModel): + analysis: Analysis ### models for summary_v2.json files diff --git a/requirements.txt b/requirements.txt index b2465502..f0a60408 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ pyasn1-modules==0.2.8 pycparser==2.22 pydantic==2.7.3 pydantic-core==2.18.4 -pydantic-xml==2.11.0 +xmltodict==0.13.0 pyelftools==0.31 pygments==2.18.0 python-flirt==0.8.10