extractors accept Path instance

This commit is contained in:
Aayush Goel
2023-07-11 00:41:36 +05:30
parent a949698b86
commit d1a1c6875b
11 changed files with 43 additions and 42 deletions

View File

@@ -6,6 +6,7 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import subprocess
from pathlib import Path
# When the script gets executed as a standalone executable (via PyInstaller), `import binaryninja` does not work because
# we have excluded the binaryninja module in `pyinstaller.spec`. The trick here is to call the system Python and try
@@ -25,9 +26,9 @@ if spec is not None:
"""
def find_binja_path() -> str:
def find_binja_path() -> Path:
raw_output = subprocess.check_output(["python", "-c", code]).decode("ascii").strip()
return bytes.fromhex(raw_output).decode("utf8")
return Path(bytes.fromhex(raw_output).decode("utf8"))
if __name__ == "__main__":

View File

@@ -9,6 +9,7 @@
from __future__ import annotations
from typing import Dict, List, Tuple, Union, Iterator, Optional
from pathlib import Path
import dnfile
from dncil.cil.opcode import OpCodes
@@ -68,9 +69,9 @@ class DnFileFeatureExtractorCache:
class DnfileFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: Path):
super().__init__()
self.pe: dnfile.dnPE = dnfile.dnPE(path)
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
# pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction
# most relevant at instruction scope

View File

@@ -1,5 +1,6 @@
import logging
from typing import Tuple, Iterator
from pathlib import Path
import dnfile
import pefile
@@ -74,10 +75,10 @@ GLOBAL_HANDLERS = (
class DnfileFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: Path):
super().__init__()
self.path: str = path
self.pe: dnfile.dnPE = dnfile.dnPE(path)
self.path: Path = path
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
def get_base_address(self) -> AbsoluteVirtualAddress:
return AbsoluteVirtualAddress(0x0)

View File

@@ -1,5 +1,6 @@
import logging
from typing import Tuple, Iterator, cast
from pathlib import Path
import dnfile
import pefile
@@ -158,10 +159,10 @@ GLOBAL_HANDLERS = (
class DotnetFileFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: Path):
super().__init__()
self.path: str = path
self.pe: dnfile.dnPE = dnfile.dnPE(path)
self.path: Path = path
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
def get_base_address(self):
return NO_ADDRESS

View File

@@ -8,6 +8,7 @@
import io
import logging
from typing import Tuple, Iterator
from pathlib import Path
from elftools.elf.elffile import ELFFile, SymbolTableSection
@@ -107,11 +108,10 @@ GLOBAL_HANDLERS = (
class ElfFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: Path):
super().__init__()
self.path = path
with open(self.path, "rb") as f:
self.elf = ELFFile(io.BytesIO(f.read()))
self.path: Path = path
self.elf = ELFFile(io.BytesIO(path.read_bytes()))
def get_base_address(self):
# virtual address of the first segment with type LOAD
@@ -120,15 +120,13 @@ class ElfFeatureExtractor(FeatureExtractor):
return AbsoluteVirtualAddress(segment.header.p_vaddr)
def extract_global_features(self):
with open(self.path, "rb") as f:
buf = f.read()
buf = self.path.read_bytes()
for feature, addr in extract_global_features(self.elf, buf):
yield feature, addr
def extract_file_features(self):
with open(self.path, "rb") as f:
buf = f.read()
buf = self.path.read_bytes()
for feature, addr in extract_file_features(self.elf, buf):
yield feature, addr

View File

@@ -7,6 +7,7 @@
# See the License for the specific language governing permissions and limitations under the License.
import logging
from pathlib import Path
import pefile
@@ -173,23 +174,21 @@ GLOBAL_HANDLERS = (
class PefileFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: Path):
super().__init__()
self.path = path
self.pe = pefile.PE(path)
self.path: Path = path
self.pe = pefile.PE(str(path))
def get_base_address(self):
return AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase)
def extract_global_features(self):
with open(self.path, "rb") as f:
buf = f.read()
buf = Path(self.path).read_bytes()
yield from extract_global_features(self.pe, buf)
def extract_file_features(self):
with open(self.path, "rb") as f:
buf = f.read()
buf = Path(self.path).read_bytes()
yield from extract_file_features(self.pe, buf)

View File

@@ -7,6 +7,7 @@
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import Any, Dict, List, Tuple, Iterator
from pathlib import Path
import viv_utils
import viv_utils.flirt
@@ -25,12 +26,11 @@ logger = logging.getLogger(__name__)
class VivisectFeatureExtractor(FeatureExtractor):
def __init__(self, vw, path, os):
def __init__(self, vw, path: Path, os):
super().__init__()
self.vw = vw
self.path = path
with open(self.path, "rb") as f:
self.buf = f.read()
self.buf = path.read_bytes()
# pre-compute these because we'll yield them at *every* scope.
self.global_features: List[Tuple[Feature, Address]] = []

View File

@@ -78,7 +78,7 @@ def get_format(sample: Path) -> str:
for feature, _ in extract_format(buf):
if feature == Format(FORMAT_PE):
dnfile_extractor = DnfileFeatureExtractor(str(sample))
dnfile_extractor = DnfileFeatureExtractor(sample)
if dnfile_extractor.is_dotnet_file():
feature = Format(FORMAT_DOTNET)

View File

@@ -533,7 +533,7 @@ def get_extractor(
if format_ == FORMAT_DOTNET:
import capa.features.extractors.dnfile.extractor
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(str(path))
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
elif backend == BACKEND_BINJA:
from capa.features.extractors.binja.find_binja_api import find_binja_path
@@ -542,8 +542,8 @@ def get_extractor(
# We need to first find the binja API installation path and add it into sys.path
if is_running_standalone():
bn_api = find_binja_path()
if Path(bn_api).exists():
sys.path.append(bn_api)
if bn_api.exists():
sys.path.append(str(bn_api))
try:
from binaryninja import BinaryView, BinaryViewType
@@ -586,14 +586,14 @@ def get_file_extractors(sample: Path, format_: str) -> List[FeatureExtractor]:
file_extractors: List[FeatureExtractor] = list()
if format_ == FORMAT_PE:
file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(str(sample)))
file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample))
elif format_ == FORMAT_DOTNET:
file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(str(sample)))
file_extractors.append(capa.features.extractors.dnfile_.DnfileFeatureExtractor(str(sample)))
file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample))
file_extractors.append(capa.features.extractors.dnfile_.DnfileFeatureExtractor(sample))
elif format_ == capa.features.extractors.common.FORMAT_ELF:
file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(str(sample)))
file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample))
return file_extractors

View File

@@ -129,7 +129,7 @@ def fixup_viv(path: Path, extractor):
def get_pefile_extractor(path: Path):
import capa.features.extractors.pefile
extractor = capa.features.extractors.pefile.PefileFeatureExtractor(str(path))
extractor = capa.features.extractors.pefile.PefileFeatureExtractor(path)
# overload the extractor so that the fixture exposes `extractor.path`
setattr(extractor, "path", path.as_posix())
@@ -140,7 +140,7 @@ def get_pefile_extractor(path: Path):
def get_dotnetfile_extractor(path: Path):
import capa.features.extractors.dotnetfile
extractor = capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(str(path))
extractor = capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(path)
# overload the extractor so that the fixture exposes `extractor.path`
setattr(extractor, "path", path.as_posix())
@@ -152,7 +152,7 @@ def get_dotnetfile_extractor(path: Path):
def get_dnfile_extractor(path: Path):
import capa.features.extractors.dnfile.extractor
extractor = capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(str(path))
extractor = capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
# overload the extractor so that the fixture exposes `extractor.path`
setattr(extractor, "path", path.as_posix())
@@ -232,7 +232,7 @@ def extract_instruction_features(extractor, fh, bbh, ih) -> Dict[Feature, Set[Ad
# note: to reduce the testing time it's recommended to reuse already existing test samples, if possible
def get_data_path_by_name(name):
def get_data_path_by_name(name) -> Path:
if name == "mimikatz":
return CD / "data" / "mimikatz.exe_"
elif name == "kernel32":
@@ -1048,7 +1048,6 @@ FEATURE_COUNT_TESTS_DOTNET = [
def do_test_feature_presence(get_extractor, sample, scope, feature, expected):
print(sample)
extractor = get_extractor(sample)
features = scope(extractor)
if expected:

View File

@@ -5,6 +5,8 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from pathlib import PosixPath
import pytest
import fixtures
from fixtures import *
@@ -27,5 +29,4 @@ def test_pefile_features(sample, scope, feature, expected):
if ".elf" in sample.name:
pytest.xfail("pefile doesn't handle ELF files")
fixtures.do_test_feature_presence(fixtures.get_pefile_extractor, sample, scope, feature, expected)