fix: remove extract_file_format from FILE_HANDLERS in five extractors

Five extractors (ghidra, dnfile, viv, binja, ida) stored Format in
global_features during __init__ and also included extract_file_format
in FILE_HANDLERS. This caused find_file_capabilities to emit the Format
feature twice, inflating feature counts. Removing extract_file_format
from FILE_HANDLERS in all five extractors ensures Format is emitted
once via global_features only.
This commit is contained in:
Willi Ballenthin
2026-04-22 19:55:40 +03:00
committed by Willi Ballenthin
parent e2c8ab4bff
commit d32492d208
7 changed files with 23 additions and 7 deletions
+21 -1
View File
@@ -19,9 +19,13 @@ import pytest
import fixtures
from dncil.clr.token import Token
from capa.features.common import Format
from capa.features.extractors.dnfile.insn import get_callee
from capa.features.extractors.dnfile.helpers import get_dotnet_table_row, calculate_dotnet_token_value
from capa.features.extractors.dnfile.extractor import DnFileFeatureExtractorCache
from capa.features.extractors.dnfile.extractor import (
DnfileFeatureExtractor,
DnFileFeatureExtractorCache,
)
# Directory containing this test module; tests join it with "data" to locate sample binaries.
CD = Path(__file__).resolve().parents[0]
@@ -81,6 +85,22 @@ def test_get_dotnet_table_row_out_of_bounds():
assert get_dotnet_table_row(pe, dnfile.mdtable.TypeDef.number, len(table.rows) + 1) is None
def test_no_duplicate_format_feature_in_dnfile_extractor():
    """Check that the dnfile extractor reports each Format value at most once.

    Gathers the Format features yielded by the file-scope and global-scope
    extraction passes and fails if any value shows up more than once across
    the two combined.
    """
    sample = fixtures.DNFILE_TESTFILES / "hello-world" / "hello-world.exe"
    if not sample.exists():
        pytest.skip("test data not available")

    extractor = DnfileFeatureExtractor(sample)

    # Drain the file-scope pass first, then the global pass, so the collected
    # values appear in that order in the failure message.
    file_scope = list(extractor.extract_file_features())
    global_scope = list(extractor.extract_global_features())

    format_values = []
    for feature, _addr in file_scope + global_scope:
        if not isinstance(feature, Format):
            continue
        format_values.append(feature.value)

    # A set collapses repeats, so any size difference means a duplicate was emitted.
    assert len(set(format_values)) == len(format_values), f"duplicate Format features: {format_values}"
def test_get_callee_invalid_methodspec_token_returns_none():
path = CD / "data" / "2dae11cc5f86f5399b560b8837c26274b7e09431deed669b0844fef44e917915.exe_"
pe = dnfile.dnPE(str(path))