diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e644691..7bcf6c99 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,7 @@
 - render: use default styling for dynamic -vv API/call details so they are easier to see @devs6186 #1865
 - loader: handle struct.error from dnfile and show clear CorruptFile message @devs6186 #2442
 - address: fix TypeError when sorting locations containing mixed address types @devs6186 #2195
+- loader: skip PE files with unrealistically large section virtual sizes to prevent resource exhaustion @devs6186 #1989
 
 ### capa Explorer Web
 - webui: fix 404 for "View rule in capa-rules" by using encodeURIComponent for rule name in URL @devs6186 #2482
diff --git a/capa/loader.py b/capa/loader.py
index d89d4c09..939680ab 100644
--- a/capa/loader.py
+++ b/capa/loader.py
@@ -126,6 +126,68 @@ def get_meta_str(vw):
     return f"{', '.join(meta)}, number of functions: {len(vw.getFunctions())}"
 
 
+# Thresholds used by _is_probably_corrupt_pe: a section's virtual size must
+# exceed both limits (128x the file size and 512 MB) to be flagged.
+_PE_VSIZE_FILE_RATIO = 128
+_PE_MAX_REASONABLE_VSIZE = 512 * 1024 * 1024  # 512 MB
+
+
+def _is_probably_corrupt_pe(path: Path) -> bool:
+    """
+    Heuristic check for obviously malformed PE samples that provoke
+    pathological behavior in vivisect (see GH-1989).
+
+    We treat a PE as "probably corrupt" when any section declares a
+    virtual size larger than both 512 MB and 128 times the file size,
+    e.g. a 900 MB section in a ~400 KB sample. Such cases lead vivisect
+    to try to map enormous regions and can exhaust CPU/memory.
+
+    This is best effort: when the file cannot be read or parsed as a PE,
+    return False and let the regular loading path report the problem.
+    """
+    try:
+        import pefile
+    except ImportError:
+        # If pefile is unavailable, fall back to existing behavior.
+        return False
+
+    # Stat first: it is cheap and lets us skip parsing missing/empty files.
+    try:
+        file_size = path.stat().st_size
+    except OSError:
+        return False
+
+    if file_size <= 0:
+        return False
+
+    try:
+        pe = pefile.PE(str(path), fast_load=True)
+    except Exception:
+        # Not a PE file (pefile.PEFormatError) or otherwise unparsable;
+        # let existing checks handle it.
+        return False
+
+    try:
+        vsizes = [getattr(section, "Misc_VirtualSize", 0) or 0 for section in getattr(pe, "sections", [])]
+    finally:
+        # pefile keeps files opened by path memory-mapped until close();
+        # release the mapping before vivisect loads the same sample.
+        pe.close()
+
+    max_reasonable = max(file_size * _PE_VSIZE_FILE_RATIO, _PE_MAX_REASONABLE_VSIZE)
+
+    for vsize in vsizes:
+        if vsize > max_reasonable:
+            logger.debug(
+                "detected unrealistic PE section virtual size: 0x%x (file size: 0x%x), treating as corrupt",
+                vsize,
+                file_size,
+            )
+            return True
+
+    return False
+
+
 def get_workspace(path: Path, input_format: str, sigpaths: list[Path]):
     """
     load the program at the given path into a vivisect workspace using the given format.
@@ -149,6 +211,12 @@ def get_workspace(path: Path, input_format: str, sigpaths: list[Path]):
 
     logger.debug("generating vivisect workspace for: %s", path)
 
+    if input_format in (FORMAT_PE, FORMAT_AUTO) and _is_probably_corrupt_pe(path):
+        raise CorruptFile(
+            "PE file appears to contain unrealistically large sections and is likely corrupt"
+            + " - skipping analysis to avoid excessive resource usage."
+        )
+
     try:
         if input_format == FORMAT_AUTO:
             if not is_supported_format(path):
diff --git a/tests/test_loader_segfault.py b/tests/test_loader_segfault.py
index 1ecb8a57..7d8dc20e 100644
--- a/tests/test_loader_segfault.py
+++ b/tests/test_loader_segfault.py
@@ -19,7 +19,7 @@ import pytest
 import envi.exc
 
 from capa.loader import CorruptFile, get_workspace
-from capa.features.common import FORMAT_ELF
+from capa.features.common import FORMAT_PE, FORMAT_ELF
 
 
 def test_segmentation_violation_handling():
@@ -38,3 +38,23 @@ def test_segmentation_violation_handling():
 
         with pytest.raises(CorruptFile, match="Invalid memory access"):
             get_workspace(fake_path, FORMAT_ELF, [])
+
+
+def test_corrupt_pe_with_unrealistic_section_size_short_circuits():
+    """
+    Test that a PE with an unrealistically large section virtual size
+    is caught early and raises CorruptFile before vivisect is invoked.
+
+    See #1989.
+    """
+    fake_path = Path("/tmp/fake_corrupt.exe")
+
+    with (
+        patch("capa.loader._is_probably_corrupt_pe", return_value=True),
+        patch("viv_utils.getWorkspace") as mock_workspace,
+    ):
+        with pytest.raises(CorruptFile, match="unrealistically large sections"):
+            get_workspace(fake_path, FORMAT_PE, [])
+
+    # vivisect should never have been called
+    mock_workspace.assert_not_called()