fix: remove extract_file_format from FILE_HANDLERS in five extractors

Five extractors (ghidra, dnfile, viv, binja, ida) stored Format in
global_features during __init__ and also included extract_file_format
in FILE_HANDLERS. This caused find_file_capabilities to emit the Format
feature twice, inflating feature counts. Removing extract_file_format
from FILE_HANDLERS in all five extractors ensures Format is emitted
once via global_features only.
This commit is contained in:
Willi Ballenthin
2026-04-22 19:55:40 +03:00
committed by Willi Ballenthin
parent e2c8ab4bff
commit d32492d208
7 changed files with 23 additions and 7 deletions
+1
View File
@@ -50,6 +50,7 @@
- fix: correct wrong dict key in VMRay _compute_monitor_threads assertion (used thread_id instead of process_id) @williballenthin
fix: replace assert with isinstance guard in get_callee for invalid MethodSpec tokens @williballenthin
- fix: replace assert with isinstance guard in get_callee for invalid MethodSpec tokens @williballenthin
- fix: remove extract_file_format from FILE_HANDLERS in five extractors to prevent duplicate Format features @williballenthin (SURF-51)
- fix: replace assert with guard so 2-operand ARM ADD/SUB instructions are skipped instead of crashing @williballenthin (SURF-50)
- fix: omit trailing ` -> ` suffix in syscall names when there is no return value @williballenthin (SURF-49)
- fix: use AbsoluteVirtualAddress instead of FileOffsetAddress for string addresses in Ghidra and IDA file extractors @williballenthin (SURF-48)
+1 -2
View File
@@ -31,7 +31,7 @@ from capa.features.common import (
Feature,
Characteristic,
)
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.extractors.binja.helpers import read_c_string, unmangle_c_name
@@ -181,5 +181,4 @@ FILE_HANDLERS = (
extract_file_section_names,
extract_file_embedded_pe,
extract_file_function_names,
extract_file_format,
)
-1
View File
@@ -63,7 +63,6 @@ FILE_HANDLERS = (
extract_file_import_names,
extract_file_function_names,
extract_file_strings,
extract_file_format,
extract_file_mixed_mode_characteristic_features,
extract_file_namespace_features,
extract_file_class_features,
-1
View File
@@ -241,5 +241,4 @@ FILE_HANDLERS = (
extract_file_section_names,
extract_file_strings,
extract_file_function_names,
extract_file_format,
)
-1
View File
@@ -212,5 +212,4 @@ FILE_HANDLERS = (
extract_file_section_names,
extract_file_embedded_pe,
extract_file_function_names,
extract_file_format,
)
-1
View File
@@ -154,5 +154,4 @@ FILE_HANDLERS = (
extract_file_section_names,
extract_file_strings,
extract_file_function_names,
extract_file_format,
)
+21 -1
View File
@@ -19,9 +19,13 @@ import pytest
import fixtures
from dncil.clr.token import Token
from capa.features.common import Format
from capa.features.extractors.dnfile.insn import get_callee
from capa.features.extractors.dnfile.helpers import get_dotnet_table_row, calculate_dotnet_token_value
from capa.features.extractors.dnfile.extractor import DnFileFeatureExtractorCache
from capa.features.extractors.dnfile.extractor import (
DnfileFeatureExtractor,
DnFileFeatureExtractorCache,
)
CD = Path(__file__).resolve().parent
@@ -81,6 +85,22 @@ def test_get_dotnet_table_row_out_of_bounds():
assert get_dotnet_table_row(pe, dnfile.mdtable.TypeDef.number, len(table.rows) + 1) is None
def test_no_duplicate_format_feature_in_dnfile_extractor():
    """Check that no Format value appears twice across file and global features."""
    sample = fixtures.DNFILE_TESTFILES / "hello-world" / "hello-world.exe"
    if not sample.exists():
        pytest.skip("test data not available")

    extractor = DnfileFeatureExtractor(sample)

    # materialize both feature streams up front, file features first
    file_features = list(extractor.extract_file_features())
    global_features = list(extractor.extract_global_features())

    format_values = []
    for feature, _addr in file_features + global_features:
        if isinstance(feature, Format):
            format_values.append(feature.value)

    # any repeated value shrinks the set relative to the list
    unique_values = set(format_values)
    assert len(format_values) == len(unique_values), f"duplicate Format features: {format_values}"
def test_get_callee_invalid_methodspec_token_returns_none():
path = CD / "data" / "2dae11cc5f86f5399b560b8837c26274b7e09431deed669b0844fef44e917915.exe_"
pe = dnfile.dnPE(str(path))