fix: use AbsoluteVirtualAddress for string and embedded PE addresses in Binary Ninja, Ghidra, and IDA file extractors

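block.getStart().getOffset() and seg.start_ea both return virtual addresses,
not file offsets. Wrapping them in FileOffsetAddress was semantically wrong for
PE/ELF binaries where VA != file offset. Switch to AbsoluteVirtualAddress to
match what the value actually represents. The same correction is applied to the
Binary Ninja file extractor and to embedded PE addresses; the Ghidra embedded PE
extractor still reports a FileOffsetAddress when a file offset can be resolved
and falls back to AbsoluteVirtualAddress otherwise.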
This commit is contained in:
Willi Ballenthin
2026-04-22 19:32:39 +03:00
committed by Willi Ballenthin
parent b348867e55
commit 52e8fdfc92
4 changed files with 14 additions and 14 deletions
+2 -1
View File
@@ -48,12 +48,13 @@
- fix: remove unreachable backports.functools_lru_cache fallback and dead dependency @williballenthin
- fix: Scopes.from_dict uses cls instead of self so subclasses return the correct type @williballenthin
- fix: correct wrong dict key in VMRay _compute_monitor_threads assertion (used thread_id instead of process_id) @williballenthin
- fix: replace assert with isinstance guard in get_callee for invalid MethodSpec tokens @williballenthin
- fix: use AbsoluteVirtualAddress instead of FileOffsetAddress for string and embedded PE virtual addresses in the Binary Ninja, Ghidra, and IDA file extractors @williballenthin (SURF-48)
- fix: use dest.value.value and indirect_src.value.value for LLIL_CONST call destinations in binja insn.py @williballenthin (SURF-47)
- fix: remove duplicate getPrevLocation call and dead loc variable in get_previous_instructions @williballenthin (SURF-46)
- fix: unpack getByteDef offset and slice buffer so ENDBRANCH check applies to target address, not segment start @williballenthin (SURF-45)
- fix: correct inverted loop structure in extract_function_loop so each block edge is recorded as (src, dest) @williballenthin (SURF-44)
- fix: initialize addr to None in Ghidra import extractors to prevent UnboundLocalError when external functions have no data references @williballenthin (SURF-43)
- fix: replace assert with isinstance guard in get_callee for invalid MethodSpec tokens @williballenthin
- fix: assign ConfigDict to model_config in ConciseModel so extra="ignore" is actually applied @williballenthin (SURF-42)
- fix: replace assert with isinstance guard in get_callee for invalid MethodSpec tokens @williballenthin (SURF-41)
+3 -3
View File
@@ -31,7 +31,7 @@ from capa.features.common import (
Feature,
Characteristic,
)
from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.extractors.binja.helpers import read_c_string, unmangle_c_name
@@ -46,7 +46,7 @@ def check_segment_for_pe(bv: BinaryView, seg: Segment) -> Iterator[tuple[Feature
buf = bv.read(seg.start, seg.length)
for offset, _ in capa.features.extractors.helpers.carve_pe(buf, start):
yield Characteristic("embedded pe"), FileOffsetAddress(seg.start + offset)
yield Characteristic("embedded pe"), AbsoluteVirtualAddress(seg.start + offset)
def extract_file_embedded_pe(bv: BinaryView) -> Iterator[tuple[Feature, Address]]:
@@ -122,7 +122,7 @@ def extract_file_section_names(bv: BinaryView) -> Iterator[tuple[Feature, Addres
def extract_file_strings(bv: BinaryView) -> Iterator[tuple[Feature, Address]]:
"""extract ASCII and UTF-16 LE strings"""
for s in bv.strings:
yield String(s.value), FileOffsetAddress(s.start)
yield String(s.value), AbsoluteVirtualAddress(s.start)
def extract_file_function_names(bv: BinaryView) -> Iterator[tuple[Feature, Address]]:
+5 -6
View File
@@ -85,14 +85,13 @@ def extract_file_embedded_pe() -> Iterator[tuple[Feature, Address]]:
continue
for off, _ in find_embedded_pe(capa.features.extractors.ghidra.helpers.get_block_bytes(block), mz_xor):
# add offset back to block start
ea_addr = block.getStart().add(off)
ea = ea_addr.getOffset()
f_offset = capa.features.extractors.ghidra.helpers.get_file_offset(ea_addr)
if f_offset != -1:
ea = f_offset
yield Characteristic("embedded pe"), FileOffsetAddress(ea)
yield Characteristic("embedded pe"), FileOffsetAddress(f_offset)
else:
yield Characteristic("embedded pe"), AbsoluteVirtualAddress(ea)
def extract_file_export_names() -> Iterator[tuple[Feature, Address]]:
@@ -187,11 +186,11 @@ def extract_file_strings() -> Iterator[tuple[Feature, Address]]:
for s in capa.features.extractors.strings.extract_ascii_strings(p_bytes):
offset = block.getStart().getOffset() + s.offset
yield String(s.s), FileOffsetAddress(offset)
yield String(s.s), AbsoluteVirtualAddress(offset)
for s in capa.features.extractors.strings.extract_unicode_strings(p_bytes):
offset = block.getStart().getOffset() + s.offset
yield String(s.s), FileOffsetAddress(offset)
yield String(s.s), AbsoluteVirtualAddress(offset)
def extract_file_function_names() -> Iterator[tuple[Feature, Address]]:
+4 -4
View File
@@ -28,7 +28,7 @@ import capa.features.extractors.strings
import capa.features.extractors.ida.helpers
from capa.features.file import Export, Import, Section, FunctionName
from capa.features.common import FORMAT_PE, FORMAT_ELF, Format, String, Feature, Characteristic
from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
MAX_OFFSET_PE_AFTER_MZ = 0x200
@@ -87,7 +87,7 @@ def extract_file_embedded_pe() -> Iterator[tuple[Feature, Address]]:
"""
for seg in capa.features.extractors.ida.helpers.get_segments(skip_header_segments=True):
for ea, _ in check_segment_for_pe(seg):
yield Characteristic("embedded pe"), FileOffsetAddress(ea)
yield Characteristic("embedded pe"), AbsoluteVirtualAddress(ea)
def extract_file_export_names() -> Iterator[tuple[Feature, Address]]:
@@ -161,10 +161,10 @@ def extract_file_strings() -> Iterator[tuple[Feature, Address]]:
# differing to common string extractor factor in segment offset here
for s in capa.features.extractors.strings.extract_ascii_strings(seg_buff):
yield String(s.s), FileOffsetAddress(seg.start_ea + s.offset)
yield String(s.s), AbsoluteVirtualAddress(seg.start_ea + s.offset)
for s in capa.features.extractors.strings.extract_unicode_strings(seg_buff):
yield String(s.s), FileOffsetAddress(seg.start_ea + s.offset)
yield String(s.s), AbsoluteVirtualAddress(seg.start_ea + s.offset)
def extract_file_function_names() -> Iterator[tuple[Feature, Address]]: