diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4a9bf555..85b898b4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -51,7 +51,9 @@ jobs: - name: Upgrade pip, setuptools run: python -m pip install --upgrade pip setuptools - name: Install capa with build requirements - run: pip install -e .[build] + run: | + pip install -r requirements.txt + pip install -e .[build] - name: Build standalone executable run: pyinstaller --log-level DEBUG .github/pyinstaller/pyinstaller.spec - name: Does it run (PE)? diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index cb2a00f9..4a591d77 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -25,6 +25,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install -r requirements.txt pip install -e .[build] - name: build package run: | diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 81252856..eb8ec1ce 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -35,7 +35,9 @@ jobs: with: python-version: "3.11" - name: Install dependencies - run: pip install -e .[dev] + run: | + pip install -r requirements.txt + pip install -e .[dev,scripts] - name: Lint with ruff run: pre-commit run ruff - name: Lint with isort @@ -61,7 +63,9 @@ jobs: with: python-version: "3.11" - name: Install capa - run: pip install -e .[dev] + run: | + pip install -r requirements.txt + pip install -e .[dev,scripts] - name: Run rule linter run: python scripts/lint.py rules/ @@ -96,7 +100,9 @@ jobs: if: matrix.os == 'ubuntu-20.04' run: sudo apt-get install -y libyaml-dev - name: Install capa - run: pip install -e .[dev] + run: | + pip install -r requirements.txt + pip install -e .[dev,scripts] - name: Run tests (fast) # this set of tests runs about 80% of the cases in 20% of the time, # and should catch most errors quickly. @@ -131,7 +137,9 @@ jobs: run: sudo apt-get install -y libyaml-dev - name: Install capa if: ${{ env.BN_SERIAL != 0 }} - run: pip install -e .[dev] + run: | + pip install -r requirements.txt + pip install -e .[dev,scripts] - name: install Binary Ninja if: ${{ env.BN_SERIAL != 0 }} run: | @@ -188,7 +196,9 @@ jobs: - name: Install pyyaml run: sudo apt-get install -y libyaml-dev - name: Install capa - run: pip install -e .[dev] + run: | + pip install -r requirements.txt + pip install -e .[dev,scripts] - name: Run tests run: | mkdir ./.github/ghidra/project diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d426768..2d333f51 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -110,6 +110,16 @@ repos: always_run: true pass_filenames: false +- repo: local + hooks: + - id: deptry + name: deptry + stages: [push, manual] + language: system + entry: deptry . + always_run: true + pass_filenames: false + - repo: local hooks: - id: pytest-fast @@ -128,12 +138,3 @@ repos: always_run: true pass_filenames: false -- repo: local - hooks: - - id: deptry - name: deptry - stages: [push, manual] - language: system - entry: deptry . 
- always_run: true - pass_filenames: false diff --git a/CHANGELOG.md b/CHANGELOG.md index 6eb87d99..5f3b36d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,20 +1,55 @@ # Change Log ## master (unreleased) -- Emit "dotnet" as format to ResultDocument when processing .NET files #2024 @samadpls ### New Features +### Breaking Changes + +### New Rules (0) + +- + +### Bug Fixes + +### capa explorer IDA Pro plugin + +### Development + +### Raw diffs +- [capa v7.1.0...master](https://github.com/mandiant/capa/compare/v7.1.0...master) +- [capa-rules v7.1.0...master](https://github.com/mandiant/capa-rules/compare/v7.1.0...master) + +## v7.1.0 +The v7.1.0 release brings large performance improvements to capa's rule matching engine. +Additionally, we've fixed various bugs and added new features for people using and developing capa. + +Special thanks to our repeat and new contributors: +* @sjha2048 made their first contribution in https://github.com/mandiant/capa/pull/2000 +* @Rohit1123 made their first contribution in https://github.com/mandiant/capa/pull/1990 +* @psahithireddy made their first contribution in https://github.com/mandiant/capa/pull/2020 +* @Atlas-64 made their first contribution in https://github.com/mandiant/capa/pull/2018 +* @s-ff made their first contribution in https://github.com/mandiant/capa/pull/2011 +* @samadpls made their first contribution in https://github.com/mandiant/capa/pull/2024 +* @acelynnzhang made their first contribution in https://github.com/mandiant/capa/pull/2044 +* @RainRat made their first contribution in https://github.com/mandiant/capa/pull/2058 +* @ReversingWithMe made their first contribution in https://github.com/mandiant/capa/pull/2093 +* @malwarefrank made their first contribution in https://github.com/mandiant/capa/pull/2037 + +### New Features +- Emit "dotnet" as format to ResultDocument when processing .NET files #2024 @samadpls +- ELF: detect OS from statically-linked Go binaries #1978 @williballenthin - add function in capa/helpers to load plain and compressed JSON reports #1883 @Rohit1123 - document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz +- Add json to sarif conversion script @reversingwithme - render maec/* fields #843 @s-ff - replace Halo spinner with Rich #2086 @s-ff - optimize rule matching #2080 @williballenthin +- add aarch64 as a valid architecture #2144 mehunhoff@google.com @williballenthin +- relax dependency version requirements for the capa library #2053 @williballenthin +- add scripts dependency group and update documentation #2145 @mr-tz -### Breaking Changes - - -### New Rules (17) +### New Rules (25) - impact/wipe-disk/delete-drive-layout-via-ioctl william.ballenthin@mandiant.com - host-interaction/driver/interact-with-driver-via-ioctl moritz.raabe@mandiant.com @@ -33,7 +68,14 @@ - persistence/act-as-time-provider-dll jakub.jozwiak@mandiant.com - host-interaction/gui/window/hide/hide-graphical-window-from-taskbar jakub.jozwiak@mandiant.com - compiler/dart/compiled-with-dart jakub.jozwiak@mandiant.com -- +- nursery/bypass-hidden-api-restrictions-via-jni-on-android mehunhoff@google.com +- nursery/get-current-process-filesystem-mounts-on-linux mehunhoff@google.com +- nursery/get-current-process-memory-mapping-on-linux mehunhoff@google.com +- nursery/get-system-property-on-android mehunhoff@google.com +- nursery/hook-routines-via-lsplant mehunhoff@google.com +- nursery/load-packed-dex-via-jiagu-on-android mehunhoff@google.com +- nursery/modify-api-blacklist-or-denylist-via-jni-on-android 
mehunhoff@google.com +- nursery/truncate-file-on-linux mehunhoff@google.com ### Bug Fixes @@ -43,7 +85,6 @@ - cape: support more report formats #2035 @mr-tz - elf: extract import / export symbols from stripped binaries #2096 @ygasparis - ### capa explorer IDA Pro plugin - replace deprecated IDA API find_binary with bin_search #1606 @s-ff @@ -58,8 +99,8 @@ - add deptry support #1497 @s-ff ### Raw diffs -- [capa v7.0.1...master](https://github.com/mandiant/capa/compare/v7.0.1...master) -- [capa-rules v7.0.1...master](https://github.com/mandiant/capa-rules/compare/v7.0.1...master) +- [capa v7.0.1...v7.1.0](https://github.com/mandiant/capa/compare/v7.0.1...v7.1.0) +- [capa-rules v7.0.1...v7.1.0](https://github.com/mandiant/capa-rules/compare/v7.0.1...v7.1.0) ## v7.0.1 diff --git a/capa/features/common.py b/capa/features/common.py index c4b7df8e..cb938f29 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -409,9 +409,10 @@ class Bytes(Feature): # other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types ARCH_I386 = "i386" ARCH_AMD64 = "amd64" +ARCH_AARCH64 = "aarch64" # dotnet ARCH_ANY = "any" -VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY) +VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_AARCH64, ARCH_ANY) class Arch(Feature): diff --git a/capa/features/extractors/dnfile/helpers.py b/capa/features/extractors/dnfile/helpers.py index 81156849..d7f4499e 100644 --- a/capa/features/extractors/dnfile/helpers.py +++ b/capa/features/extractors/dnfile/helpers.py @@ -83,7 +83,7 @@ def read_dotnet_user_string(pe: dnfile.dnPE, token: StringToken) -> Optional[str return None try: - user_string: Optional[dnfile.stream.UserString] = pe.net.user_strings.get_us(token.rid) + user_string: Optional[dnfile.stream.UserString] = pe.net.user_strings.get(token.rid) except UnicodeDecodeError as e: logger.debug("failed to decode #US stream index 0x%08x (%s)", token.rid, e) return None @@ -119,14 +119,14 @@ def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[DnType]: access: Optional[str] # assume .NET imports starting with get_/set_ are used to access a property - if member_ref.Name.startswith("get_"): + member_ref_name: str = str(member_ref.Name) + if member_ref_name.startswith("get_"): access = FeatureAccess.READ - elif member_ref.Name.startswith("set_"): + elif member_ref_name.startswith("set_"): access = FeatureAccess.WRITE else: access = None - member_ref_name: str = member_ref.Name if member_ref_name.startswith(("get_", "set_")): # remove get_/set_ from MemberRef name member_ref_name = member_ref_name[4:] @@ -212,7 +212,7 @@ def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnType]: token: int = calculate_dotnet_token_value(method.table.number, method.row_index) access: Optional[str] = accessor_map.get(token) - method_name: str = method.row.Name + method_name: str = str(method.row.Name) if method_name.startswith(("get_", "set_")): # remove get_/set_ method_name = method_name[4:] @@ -289,8 +289,8 @@ def get_dotnet_unmanaged_imports(pe: dnfile.dnPE) -> Iterator[DnUnmanagedMethod] logger.debug("ImplMap[0x%X] ImportScope row is None", rid) module = "" else: - module = impl_map.ImportScope.row.Name - method: str = impl_map.ImportName + module = str(impl_map.ImportScope.row.Name) + method: str = str(impl_map.ImportName) member_forward_table: int if impl_map.MemberForwarded.table is None: @@ -320,8 +320,11 @@ def get_dotnet_table_row(pe: dnfile.dnPE, table_index: int, row_index: int) -> O if row_index - 1 <= 0: return None + table: 
Optional[dnfile.base.ClrMetaDataTable] = pe.net.mdtables.tables.get(table_index) + if table is None: + return None + try: - table = pe.net.mdtables.tables.get(table_index, []) return table[row_index - 1] except IndexError: return None @@ -334,7 +337,7 @@ def resolve_nested_typedef_name( if index in nested_class_table: typedef_name = [] - name = typedef.TypeName + name = str(typedef.TypeName) # Append the current typedef name typedef_name.append(name) @@ -343,24 +346,24 @@ def resolve_nested_typedef_name( # Iterate through the typedef table to resolve the nested name table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeDef.number, nested_class_table[index]) if table_row is None: - return typedef.TypeNamespace, tuple(typedef_name[::-1]) + return str(typedef.TypeNamespace), tuple(typedef_name[::-1]) - name = table_row.TypeName + name = str(table_row.TypeName) typedef_name.append(name) index = nested_class_table[index] # Document the root enclosing details table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeDef.number, nested_class_table[index]) if table_row is None: - return typedef.TypeNamespace, tuple(typedef_name[::-1]) + return str(typedef.TypeNamespace), tuple(typedef_name[::-1]) - enclosing_name = table_row.TypeName + enclosing_name = str(table_row.TypeName) typedef_name.append(enclosing_name) - return table_row.TypeNamespace, tuple(typedef_name[::-1]) + return str(table_row.TypeNamespace), tuple(typedef_name[::-1]) else: - return typedef.TypeNamespace, (typedef.TypeName,) + return str(typedef.TypeNamespace), (str(typedef.TypeName),) def resolve_nested_typeref_name( @@ -370,29 +373,29 @@ def resolve_nested_typeref_name( # If the ResolutionScope decodes to a typeRef type then it is nested if isinstance(typeref.ResolutionScope.table, dnfile.mdtable.TypeRef): typeref_name = [] - name = typeref.TypeName + name = str(typeref.TypeName) # Not appending the current typeref name to avoid potential duplicate # Validate index table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeRef.number, index) if table_row is None: - return typeref.TypeNamespace, (typeref.TypeName,) + return str(typeref.TypeNamespace), (str(typeref.TypeName),) while isinstance(table_row.ResolutionScope.table, dnfile.mdtable.TypeRef): # Iterate through the typeref table to resolve the nested name typeref_name.append(name) - name = table_row.TypeName + name = str(table_row.TypeName) table_row = get_dotnet_table_row(pe, dnfile.mdtable.TypeRef.number, table_row.ResolutionScope.row_index) if table_row is None: - return typeref.TypeNamespace, tuple(typeref_name[::-1]) + return str(typeref.TypeNamespace), tuple(typeref_name[::-1]) # Document the root enclosing details - typeref_name.append(table_row.TypeName) + typeref_name.append(str(table_row.TypeName)) - return table_row.TypeNamespace, tuple(typeref_name[::-1]) + return str(table_row.TypeNamespace), tuple(typeref_name[::-1]) else: - return typeref.TypeNamespace, (typeref.TypeName,) + return str(typeref.TypeNamespace), (str(typeref.TypeName),) def get_dotnet_nested_class_table_index(pe: dnfile.dnPE) -> Dict[int, int]: diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index 75bf32dc..5ab99857 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -78,12 +78,12 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple for _, typedef in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number): # emit internal .NET namespaces assert isinstance(typedef, 
dnfile.mdtable.TypeDefRow) - namespaces.add(typedef.TypeNamespace) + namespaces.add(str(typedef.TypeNamespace)) for _, typeref in iter_dotnet_table(pe, dnfile.mdtable.TypeRef.number): # emit external .NET namespaces assert isinstance(typeref, dnfile.mdtable.TypeRefRow) - namespaces.add(typeref.TypeNamespace) + namespaces.add(str(typeref.TypeNamespace)) # namespaces may be empty, discard namespaces.discard("") diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index e43332e6..0cbabda1 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -58,6 +58,10 @@ class OS(str, Enum): SYLLABLE = "syllable" NACL = "nacl" ANDROID = "android" + DRAGONFLYBSD = "dragonfly BSD" + ILLUMOS = "illumos" + ZOS = "z/os" + UNIX = "unix" # via readelf: https://github.com/bminor/binutils-gdb/blob/c0e94211e1ac05049a4ce7c192c9d14d1764eb3e/binutils/readelf.c#L19635-L19658 @@ -81,6 +85,8 @@ class Phdr: paddr: int filesz: int buf: bytes + flags: int + memsz: int @dataclass @@ -315,24 +321,23 @@ class ELF: phent_offset = i * self.e_phentsize phent = self.phbuf[phent_offset : phent_offset + self.e_phentsize] - (p_type,) = struct.unpack_from(self.endian + "I", phent, 0x0) - logger.debug("ph:p_type: 0x%04x", p_type) - if self.bitness == 32: - p_offset, p_vaddr, p_paddr, p_filesz = struct.unpack_from(self.endian + "IIII", phent, 0x4) + p_type, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz, p_flags = struct.unpack_from( + self.endian + "IIIIIII", phent, 0x0 + ) elif self.bitness == 64: - p_offset, p_vaddr, p_paddr, p_filesz = struct.unpack_from(self.endian + "QQQQ", phent, 0x8) + p_type, p_flags, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz = struct.unpack_from( + self.endian + "IIQQQQQ", phent, 0x0 + ) else: raise NotImplementedError() - logger.debug("ph:p_offset: 0x%02x p_filesz: 0x%04x", p_offset, p_filesz) - self.f.seek(p_offset) buf = self.f.read(p_filesz) if len(buf) != p_filesz: raise ValueError("failed to read program header content") - return Phdr(p_type, p_offset, p_vaddr, p_paddr, p_filesz, buf) + return Phdr(p_type, p_offset, p_vaddr, p_paddr, p_filesz, buf, p_flags, p_memsz) @property def program_headers(self): @@ -357,8 +362,6 @@ class ELF: else: raise NotImplementedError() - logger.debug("sh:sh_offset: 0x%02x sh_size: 0x%04x", sh_offset, sh_size) - self.f.seek(sh_offset) buf = self.f.read(sh_size) if len(buf) != sh_size: @@ -867,6 +870,8 @@ def guess_os_from_ident_directive(elf: ELF) -> Optional[OS]: return OS.LINUX elif "Red Hat" in comment: return OS.LINUX + elif "Alpine" in comment: + return OS.LINUX elif "Android" in comment: return OS.ANDROID @@ -952,11 +957,506 @@ def guess_os_from_symtab(elf: ELF) -> Optional[OS]: for os, hints in keywords.items(): if any(hint in sym_name for hint in hints): + logger.debug("symtab: %s looks like %s", sym_name, os) return os return None +def is_go_binary(elf: ELF) -> bool: + for shdr in elf.section_headers: + if shdr.get_name(elf) == ".note.go.buildid": + logger.debug("go buildinfo: found section .note.go.buildid") + return True + + # The `go version` command enumerates sections for the name `.go.buildinfo` + # (in addition to looking for the BUILDINFO_MAGIC) to check if an executable is go or not. 
+ # See references to the `errNotGoExe` error here: + # https://github.com/golang/go/blob/master/src/debug/buildinfo/buildinfo.go#L41 + for shdr in elf.section_headers: + if shdr.get_name(elf) == ".go.buildinfo": + logger.debug("go buildinfo: found section .go.buildinfo") + return True + + # other strategy used by FLOSS: search for known runtime strings. + # https://github.com/mandiant/flare-floss/blob/b2ca8adfc5edf278861dd6bff67d73da39683b46/floss/language/identify.py#L88 + return False + + +def get_go_buildinfo_data(elf: ELF) -> Optional[bytes]: + for shdr in elf.section_headers: + if shdr.get_name(elf) == ".go.buildinfo": + logger.debug("go buildinfo: found section .go.buildinfo") + return shdr.buf + + PT_LOAD = 0x1 + PF_X = 1 + PF_W = 2 + for phdr in elf.program_headers: + if phdr.type != PT_LOAD: + continue + + if (phdr.flags & (PF_X | PF_W)) == PF_W: + logger.debug("go buildinfo: found data segment") + return phdr.buf + + return None + + +def read_data(elf: ELF, rva: int, size: int) -> Optional[bytes]: + # ELF segments are for runtime data, + # ELF sections are for link-time data. + # So we want to read Program Headers/Segments. + for phdr in elf.program_headers: + if phdr.vaddr <= rva < phdr.vaddr + phdr.memsz: + segment_data = phdr.buf + + # pad the section with NULLs + # assume page alignment is already handled. + # might need more hardening here. + if len(segment_data) < phdr.memsz: + segment_data += b"\x00" * (phdr.memsz - len(segment_data)) + + segment_offset = rva - phdr.vaddr + return segment_data[segment_offset : segment_offset + size] + + return None + + +def read_go_slice(elf: ELF, rva: int) -> Optional[bytes]: + if elf.bitness == 32: + struct_size = 8 + struct_format = elf.endian + "II" + elif elf.bitness == 64: + struct_size = 16 + struct_format = elf.endian + "QQ" + else: + raise ValueError("invalid psize") + + struct_buf = read_data(elf, rva, struct_size) + if not struct_buf: + return None + + addr, length = struct.unpack_from(struct_format, struct_buf, 0) + + return read_data(elf, addr, length) + + +def guess_os_from_go_buildinfo(elf: ELF) -> Optional[OS]: + """ + In a binary compiled by Go, the buildinfo structure may contain + metadata about the build environment, including the configured + GOOS, which specifies the target operating system. + + Search for and parse the buildinfo structure, + which may be found in the .go.buildinfo section, + and often contains this metadata inline. Otherwise, + follow a few byte slices to the relevant information. + + This strategy is derived from GoReSym. + """ + buf = get_go_buildinfo_data(elf) + if not buf: + logger.debug("go buildinfo: no buildinfo section") + return None + + assert isinstance(buf, bytes) + + # The build info blob left by the linker is identified by + # a 16-byte header, consisting of: + # - buildInfoMagic (14 bytes), + # - the binary's pointer size (1 byte), and + # - whether the binary is big endian (1 byte). + # + # Then: + # - virtual address to Go string: runtime.buildVersion + # - virtual address to Go string: runtime.modinfo + # + # On 32-bit platforms, the last 8 bytes are unused. + # + # If the endianness has the 2 bit set, then the pointers are zero, + # and the 32-byte header is followed by varint-prefixed string data + # for the two string values we care about. 
+    # https://github.com/mandiant/GoReSym/blob/0860a1b1b4f3495e9fb7e71eb4386bf3e0a7c500/buildinfo/buildinfo.go#L185-L193
+    BUILDINFO_MAGIC = b"\xFF Go buildinf:"
+
+    try:
+        index = buf.index(BUILDINFO_MAGIC)
+    except ValueError:
+        logger.debug("go buildinfo: no buildinfo magic")
+        return None
+
+    psize, flags = struct.unpack_from("BB", buf, index + len(BUILDINFO_MAGIC))
+    logger.debug("go buildinfo: psize: 0x%x flags: 0x%x", psize, flags)
+
+    is_big_endian = flags & 0b01 == 0b01
+
+    GOOS_TO_OS = {
+        b"aix": OS.AIX,
+        b"android": OS.ANDROID,
+        b"dragonfly": OS.DRAGONFLYBSD,
+        b"freebsd": OS.FREEBSD,
+        b"illumos": OS.ILLUMOS,
+        b"linux": OS.LINUX,
+        b"netbsd": OS.NETBSD,
+        b"openbsd": OS.OPENBSD,
+        b"solaris": OS.SOLARIS,
+    }
+
+    info_format = {
+        (4, False): "<II",
+        (4, True): ">II",
+        (8, False): "<QQ",
+        # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 01}
+        # like: d44ba497964050c0e3dd2a192c511e4c3c4f17717f0322a554d64b797ee4690a
+        # in which the modinfo doesn't have GOOS.
+        (8, True): ">QQ",
+    }
+
+    build_version_address, modinfo_address = struct.unpack_from(
+        info_format[(psize, is_big_endian)], buf, index + 0x10
+    )
+    logger.debug("go buildinfo: build version address: 0x%x", build_version_address)
+    logger.debug("go buildinfo: modinfo address: 0x%x", modinfo_address)
+
+    build_version = read_go_slice(elf, build_version_address)
+    if build_version:
+        logger.debug("go buildinfo: build version: %s", build_version.decode("utf-8"))
+
+    modinfo = read_go_slice(elf, modinfo_address)
+    if modinfo:
+        if modinfo[-0x11] == ord("\n"):
+            # Strip module framing: sentinel strings delimiting the module info.
+            # These are cmd/go/internal/modload/build.infoStart and infoEnd,
+            # which should be:
+            #   infoStart, _ = hex.DecodeString("3077af0c9274080241e1c107e6d618e6")
+            #   infoEnd, _ = hex.DecodeString("f932433186182072008242104116d8f2")
+            modinfo = modinfo[0x10:-0x10]
+        logger.debug("go buildinfo: modinfo: %s", modinfo.decode("utf-8"))
+
+    if not modinfo:
+        return None
+
+    for key, os in GOOS_TO_OS.items():
+        # Brute force the k-v pair, like `GOOS=linux`,
+        # rather than try to parse the data, which would be fragile.
+        if (b"GOOS=" + key) in modinfo:
+            logger.debug("go buildinfo: found os: %s", os)
+            return os
+
+    return None
+
+
+def guess_os_from_go_source(elf: ELF) -> Optional[OS]:
+    """
+    In a binary compiled by Go, runtime metadata may contain
+    references to the source filenames, including the
+    src/runtime/os_* files, whose name indicates the
+    target operating system.
+
+    Confirm the given ELF seems to be built by Go,
+    and then look for strings that look like
+    Go source filenames.
+
+    This strategy is derived from GoReSym.
+ """ + if not is_go_binary(elf): + return None + + for phdr in elf.program_headers: + buf = phdr.buf + NEEDLE_OS = b"/src/runtime/os_" + try: + index = buf.index(NEEDLE_OS) + except ValueError: + continue + + rest = buf[index + len(NEEDLE_OS) : index + len(NEEDLE_OS) + 32] + filename = rest.partition(b".go")[0].decode("utf-8") + logger.debug("go source: filename: /src/runtime/os_%s.go", filename) + + # via: https://cs.opensource.google/go/go/+/master:src/runtime/;bpv=1;bpt=0 + # candidates today: + # - aix + # - android + # - darwin + # - darwin_arm64 + # - dragonfly + # - freebsd + # - freebsd2 + # - freebsd_amd64 + # - freebsd_arm + # - freebsd_arm64 + # - freebsd_noauxv + # - freebsd_riscv64 + # - illumos + # - js + # - linux + # - linux_arm + # - linux_arm64 + # - linux_be64 + # - linux_generic + # - linux_loong64 + # - linux_mips64x + # - linux_mipsx + # - linux_noauxv + # - linux_novdso + # - linux_ppc64x + # - linux_riscv64 + # - linux_s390x + # - linux_x86 + # - netbsd + # - netbsd_386 + # - netbsd_amd64 + # - netbsd_arm + # - netbsd_arm64 + # - nonopenbsd + # - only_solaris + # - openbsd + # - openbsd_arm + # - openbsd_arm64 + # - openbsd_libc + # - openbsd_mips64 + # - openbsd_syscall + # - openbsd_syscall1 + # - openbsd_syscall2 + # - plan9 + # - plan9_arm + # - solaris + # - unix + # - unix_nonlinux + # - wasip1 + # - wasm + # - windows + # - windows_arm + # - windows_arm64 + + OS_FILENAME_TO_OS = { + "aix": OS.AIX, + "android": OS.ANDROID, + "dragonfly": OS.DRAGONFLYBSD, + "freebsd": OS.FREEBSD, + "freebsd2": OS.FREEBSD, + "freebsd_": OS.FREEBSD, + "illumos": OS.ILLUMOS, + "linux": OS.LINUX, + "netbsd": OS.NETBSD, + "only_solaris": OS.SOLARIS, + "openbsd": OS.OPENBSD, + "solaris": OS.SOLARIS, + "unix_nonlinux": OS.UNIX, + } + + for prefix, os in OS_FILENAME_TO_OS.items(): + if filename.startswith(prefix): + return os + + for phdr in elf.program_headers: + buf = phdr.buf + NEEDLE_RT0 = b"/src/runtime/rt0_" + try: + index = buf.index(NEEDLE_RT0) + except ValueError: + continue + + rest = buf[index + len(NEEDLE_RT0) : index + len(NEEDLE_RT0) + 32] + filename = rest.partition(b".s")[0].decode("utf-8") + logger.debug("go source: filename: /src/runtime/rt0_%s.s", filename) + + # via: https://cs.opensource.google/go/go/+/master:src/runtime/;bpv=1;bpt=0 + # candidates today: + # - aix_ppc64 + # - android_386 + # - android_amd64 + # - android_arm + # - android_arm64 + # - darwin_amd64 + # - darwin_arm64 + # - dragonfly_amd64 + # - freebsd_386 + # - freebsd_amd64 + # - freebsd_arm + # - freebsd_arm64 + # - freebsd_riscv64 + # - illumos_amd64 + # - ios_amd64 + # - ios_arm64 + # - js_wasm + # - linux_386 + # - linux_amd64 + # - linux_arm + # - linux_arm64 + # - linux_loong64 + # - linux_mips64x + # - linux_mipsx + # - linux_ppc64 + # - linux_ppc64le + # - linux_riscv64 + # - linux_s390x + # - netbsd_386 + # - netbsd_amd64 + # - netbsd_arm + # - netbsd_arm64 + # - openbsd_386 + # - openbsd_amd64 + # - openbsd_arm + # - openbsd_arm64 + # - openbsd_mips64 + # - openbsd_ppc64 + # - openbsd_riscv64 + # - plan9_386 + # - plan9_amd64 + # - plan9_arm + # - solaris_amd64 + # - wasip1_wasm + # - windows_386 + # - windows_amd64 + # - windows_arm + # - windows_arm64 + + RT0_FILENAME_TO_OS = { + "aix": OS.AIX, + "android": OS.ANDROID, + "dragonfly": OS.DRAGONFLYBSD, + "freebsd": OS.FREEBSD, + "illumos": OS.ILLUMOS, + "linux": OS.LINUX, + "netbsd": OS.NETBSD, + "openbsd": OS.OPENBSD, + "solaris": OS.SOLARIS, + } + + for prefix, os in RT0_FILENAME_TO_OS.items(): + if filename.startswith(prefix): + 
+                return os
+
+    return None
+
+
+def guess_os_from_vdso_strings(elf: ELF) -> Optional[OS]:
+    """
+    The "vDSO" (virtual dynamic shared object) is a small shared
+    library that the kernel automatically maps into the address space
+    of all user-space applications.
+
+    Some statically linked executables include small dynamic linker
+    routines that find these vDSO symbols, using the ASCII
+    symbol name and version. We can therefore recognize these
+    (symbol, version) pairs to guess that the binary targets Linux.
+    """
+    for phdr in elf.program_headers:
+        buf = phdr.buf
+
+        # We don't really use the arch, but it's interesting for documentation.
+        # I suppose we could restrict the arch here to what's in the ELF header,
+        # but that's even more work. Let's see if this is sufficient.
+        for arch, symbol, version in (
+            # via: https://man7.org/linux/man-pages/man7/vdso.7.html
+            ("arm", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("arm", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("aarch64", b"__kernel_rt_sigreturn", b"LINUX_2.6.39"),
+            ("aarch64", b"__kernel_gettimeofday", b"LINUX_2.6.39"),
+            ("aarch64", b"__kernel_clock_gettime", b"LINUX_2.6.39"),
+            ("aarch64", b"__kernel_clock_getres", b"LINUX_2.6.39"),
+            ("mips", b"__kernel_gettimeofday", b"LINUX_2.6"),
+            ("mips", b"__kernel_clock_gettime", b"LINUX_2.6"),
+            ("ia64", b"__kernel_sigtramp", b"LINUX_2.5"),
+            ("ia64", b"__kernel_syscall_via_break", b"LINUX_2.5"),
+            ("ia64", b"__kernel_syscall_via_epc", b"LINUX_2.5"),
+            ("ppc/32", b"__kernel_clock_getres", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_clock_gettime", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_clock_gettime64", b"LINUX_5.11"),
+            ("ppc/32", b"__kernel_datapage_offset", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_get_syscall_map", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_get_tbfreq", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_getcpu", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_gettimeofday", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sigtramp_rt32", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sigtramp32", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sync_dicache", b"LINUX_2.6.15"),
+            ("ppc/32", b"__kernel_sync_dicache_p5", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_clock_getres", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_clock_gettime", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_datapage_offset", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_get_syscall_map", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_get_tbfreq", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_getcpu", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_gettimeofday", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_sigtramp_rt64", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_sync_dicache", b"LINUX_2.6.15"),
+            ("ppc/64", b"__kernel_sync_dicache_p5", b"LINUX_2.6.15"),
+            ("riscv", b"__vdso_rt_sigreturn", b"LINUX_4.15"),
+            ("riscv", b"__vdso_gettimeofday", b"LINUX_4.15"),
+            ("riscv", b"__vdso_clock_gettime", b"LINUX_4.15"),
+            ("riscv", b"__vdso_clock_getres", b"LINUX_4.15"),
+            ("riscv", b"__vdso_getcpu", b"LINUX_4.15"),
+            ("riscv", b"__vdso_flush_icache", b"LINUX_4.15"),
+            ("s390", b"__kernel_clock_getres", b"LINUX_2.6.29"),
+            ("s390", b"__kernel_clock_gettime", b"LINUX_2.6.29"),
+            ("s390", b"__kernel_gettimeofday", b"LINUX_2.6.29"),
+            ("superh", b"__kernel_rt_sigreturn", b"LINUX_2.6"),
+            ("superh", b"__kernel_sigreturn", b"LINUX_2.6"),
+            ("superh", b"__kernel_vsyscall", b"LINUX_2.6"),
+            ("i386", b"__kernel_sigreturn", b"LINUX_2.5"),
+            ("i386", b"__kernel_rt_sigreturn", b"LINUX_2.5"),
+            ("i386", b"__kernel_vsyscall", b"LINUX_2.5"),
+            ("i386", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("i386", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("i386", b"__vdso_time", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_getcpu", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("x86-64", b"__vdso_time", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_clock_gettime", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_getcpu", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_gettimeofday", b"LINUX_2.6"),
+            ("x86/32", b"__vdso_time", b"LINUX_2.6"),
+        ):
+            if symbol in buf and version in buf:
+                logger.debug("vdso string: %s %s %s", arch, symbol.decode("ascii"), version.decode("ascii"))
+                return OS.LINUX
+
+    return None
+
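For comparison with the hand-rolled section walk in is_go_binary() above, the same
check can be reproduced standalone with pyelftools (already a capa dependency).
This is an illustrative sketch, not code from this diff; the helper name is
hypothetical:

    from typing import BinaryIO

    from elftools.elf.elffile import ELFFile

    def looks_like_go_elf(f: BinaryIO) -> bool:
        # Go's linker emits these sections; `go version` keys on them too.
        return any(
            section.name in (".note.go.buildid", ".go.buildinfo")
            for section in ELFFile(f).iter_sections()
        )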
 def detect_elf_os(f) -> str:
     """
     f: type Union[BinaryIO, IDAIO, GHIDRAIO]
     """
@@ -1023,6 +1523,27 @@
         logger.warning("Error guessing OS from symbol table: %s", e)
         symtab_guess = None

+    try:
+        goos_guess = guess_os_from_go_buildinfo(elf)
+        logger.debug("guess: Go buildinfo: %s", goos_guess)
+    except Exception as e:
+        logger.warning("Error guessing OS from Go buildinfo: %s", e)
+        goos_guess = None
+
+    try:
+        gosrc_guess = guess_os_from_go_source(elf)
+        logger.debug("guess: Go source: %s", gosrc_guess)
+    except Exception as e:
+        logger.warning("Error guessing OS from Go source path: %s", e)
+        gosrc_guess = None
+
+    try:
+        vdso_guess = guess_os_from_vdso_strings(elf)
+        logger.debug("guess: vdso strings: %s", vdso_guess)
+    except Exception as e:
+        logger.warning("Error guessing OS from vdso strings: %s", e)
+        vdso_guess = None
+
     ret = None

     if osabi_guess:
@@ -1046,11 +1567,24 @@
     elif symtab_guess:
         ret = symtab_guess

+    elif goos_guess:
+        ret = goos_guess
+
+    elif gosrc_guess:
+        # prefer goos_guess to this method,
+        # which is just string interpretation.
+        ret = gosrc_guess
+
     elif ident_guess:
         # at the bottom because we don't trust this too much
         # due to potential for bugs with cross-compilation.
         ret = ident_guess

+    elif vdso_guess:
+        # at the bottom because this is just scanning strings,
+        # which isn't very authoritative.
+        ret = vdso_guess
+
     return ret.value if ret is not None else "unknown"

diff --git a/capa/loader.py b/capa/loader.py
index 8e91fae0..e741175e 100644
--- a/capa/loader.py
+++ b/capa/loader.py
@@ -8,6 +8,7 @@
 import sys
 import logging
 import datetime
+import contextlib
 from typing import Set, Dict, List, Optional
 from pathlib import Path
@@ -154,6 +155,18 @@ def get_workspace(path: Path, input_format: str, sigpaths: List[Path]):
         viv_utils.flirt.register_flirt_signature_analyzers(vw, [str(s) for s in sigpaths])

+    with contextlib.suppress(Exception):
+        # unfortunately viv raises a raw Exception (not any subclass).
+        # This happens when the module isn't found, such as with a viv upgrade.
+        #
+        # Remove the symbolic switch case solver.
+        # This is only enabled for ELF files, not PE files.
+        # During the following performance investigation, this analysis module
+        # had some terrible worst-case behavior.
+        # We can put up with slightly worse CFG reconstruction in order to avoid this.
+        # https://github.com/mandiant/capa/issues/1989#issuecomment-1948022767
+        vw.delFuncAnalysisModule("vivisect.analysis.generic.symswitchcase")
+
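The contextlib.suppress() pattern above, shown in isolation (a minimal sketch
with stand-in names; `registry` and `remove_module` are hypothetical, not
vivisect APIs):

    import contextlib

    registry = {"fastpass": object()}  # stand-in for vivisect's analysis modules

    def remove_module(name: str) -> None:
        if name not in registry:
            raise Exception(f"unknown module: {name}")  # bare Exception, like viv
        del registry[name]

    with contextlib.suppress(Exception):
        # a missing module is tolerated: the error is swallowed and loading continues
        remove_module("vivisect.analysis.generic.symswitchcase")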
    vw.analyze()

     logger.debug("%s", get_meta_str(vw))

diff --git a/capa/version.py b/capa/version.py
index 06bdc4ae..65fe77ff 100644
--- a/capa/version.py
+++ b/capa/version.py
@@ -5,7 +5,7 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-__version__ = "7.0.1"
+__version__ = "7.1.0"

 def get_major_version():

diff --git a/doc/installation.md b/doc/installation.md
index 57c939c2..e5e7135e 100644
--- a/doc/installation.md
+++ b/doc/installation.md
@@ -91,6 +91,12 @@ For more details about creating and using virtual environments, check out the [v
 ##### Install development dependencies

+When developing capa, please use the pinned dependencies found in `requirements.txt`.
+This ensures that everyone has the exact same, reproducible environment.
+Please install these dependencies before installing capa (from source or from PyPI):
+
+`$ pip install -r requirements.txt`
+
 We use the following tools to ensure consistent code style and formatting:
 - [black](https://github.com/psf/black) code formatter
 - [isort](https://pypi.org/project/isort/) code formatter
@@ -101,7 +107,8 @@ We use the following tools to ensure consistent code style and formatting:
 To install these development dependencies, run:

-`$ pip install -e /local/path/to/src[dev]`
+- `$ pip install -e /local/path/to/src[dev]` or
+- `$ pip install -e /local/path/to/src[dev,scripts]` to also install all script dependencies

 We use [pre-commit](https://pre-commit.com/) so that it's trivial to run the same linters & configuration locally as in CI.

diff --git a/pyproject.toml b/pyproject.toml
index 714a567b..3afcb54d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,25 +32,76 @@ classifiers = [
     "Topic :: Security",
 ]
 dependencies = [
-    "tqdm==4.66.4",
-    "pyyaml==6.0.1",
-    "tabulate==0.9.0",
-    "colorama==0.4.6",
-    "termcolor==2.4.0",
-    "wcwidth==0.2.13",
-    "ida-settings==2.1.0",
-    "viv-utils[flirt]==0.7.9",
-    "networkx==3.1",
-    "ruamel.yaml==0.18.6",
-    "vivisect==1.1.1",
-    "pefile==2023.2.7",
-    "pyelftools==0.31",
-    "dnfile==0.14.1",
-    "dncil==1.0.2",
-    "pydantic==2.7.1",
-    "rich==13.7.1",
-    "humanize==4.9.0",
-    "protobuf==5.27.0",
+    # ---------------------------------------
+    # As a library, capa uses lower version bounds
+    # when specifying its dependencies. This lets
+    # other programs that use capa (and other libraries)
+    # find a compatible set of dependency versions.
+    #
+    # We can optionally pin to specific versions or
+    # limit the upper bound when there's a good reason;
+    # but the default is to assume all greater versions
+    # probably work with capa until proven otherwise.
+    #
+    # The following link provides good background:
+    # https://iscinumpy.dev/post/bound-version-constraints/
+    #
+    # When we develop capa, and when we distribute it as
+    # a standalone binary, we'll use specific versions
+    # that are pinned in requirements.txt.
+    # But the requirements for a library are specified here
+    # and are looser.
+ # + # Related discussions: + # + # - https://github.com/mandiant/capa/issues/2053 + # - https://github.com/mandiant/capa/pull/2059 + # - https://github.com/mandiant/capa/pull/2079 + # + # --------------------------------------- + # The following dependency versions were imported + # during June 2024 by truncating specific versions to + # their major-most version (major version when possible, + # or minor otherwise). + # As specific constraints are identified, please provide + # comments and context. + "tqdm>=4", + "pyyaml>=6", + "tabulate>=0.9", + "colorama>=0.4", + "termcolor>=2", + "wcwidth>=0.2", + "ida-settings>=2", + "ruamel.yaml>=0.18", + "pefile>=2023.2.7", + "pyelftools>=0.31", + "pydantic>=2", + "rich>=13", + "humanize>=4", + "protobuf>=5", + + # --------------------------------------- + # Dependencies that we develop + # + # These dependencies are often actively influenced by capa, + # so we provide a minimum patch version that includes the + # latest bug fixes we need here. + "viv-utils[flirt]>=0.7.9", + "vivisect>=1.1.1", + "dncil>=1.0.2", + + # --------------------------------------- + # Dependencies with version caps + # + # These dependencies must not exceed the version cap, + # typically due to dropping support for python releases + # we still support. + + # TODO(williballenthin): networkx 3.2 doesn't support python 3.8 while capa does. + # https://github.com/mandiant/capa/issues/1966 + "networkx>=3,<3.2", + + "dnfile>=0.15.0", ] dynamic = ["version"] @@ -63,6 +114,10 @@ namespaces = false [project.optional-dependencies] dev = [ + # Dev and build dependencies are not relaxed because + # we want all developer environments to be consistent. + # These dependencies are not used in production environments + # and should not conflict with other libraries/tooling. "pre-commit==3.5.0", "pytest==8.0.0", "pytest-sugar==1.0.0", @@ -79,14 +134,12 @@ dev = [ "flake8-simplify==0.21.0", "flake8-use-pathlib==0.3.0", "flake8-copyright==0.2.4", - "ruff==0.4.7", + "ruff==0.4.8", "black==24.4.2", "isort==5.13.2", "mypy==1.10.0", - "psutil==5.9.2", - "stix2==3.0.1", - "requests==2.31.0", "mypy-protobuf==3.6.0", + "PyGithub==2.3.0", # type stubs for mypy "types-backports==0.1.3", "types-colorama==0.4.15.11", @@ -99,10 +152,21 @@ dev = [ "deptry==0.16.1" ] build = [ - "pyinstaller==6.7.0", - "setuptools==69.5.1", + # Dev and build dependencies are not relaxed because + # we want all developer environments to be consistent. + # These dependencies are not used in production environments + # and should not conflict with other libraries/tooling. + "pyinstaller==6.8.0", + "setuptools==70.0.0", "build==1.2.1" ] +scripts = [ + "jschema_to_python==1.2.3", + "psutil==5.9.2", + "stix2==3.0.1", + "sarif_om==1.0.4", + "requests==2.31.0", +] [tool.deptry] extend_exclude = [ @@ -152,6 +216,7 @@ DEP002 = [ "mypy", "mypy-protobuf", "pre-commit", + "PyGithub", "pyinstaller", "pytest", "pytest-cov", @@ -175,6 +240,9 @@ DEP003 = [ "typing_extensions" # TODO(s-ff): remove when Python 3.9 is deprecated, see #1699 ] +[tool.deptry.package_module_name_map] +PyGithub = "github" + [project.urls] Homepage = "https://github.com/mandiant/capa" Repository = "https://github.com/mandiant/capa.git" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..b667e63a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,46 @@ +# Dependencies with specific version constraints +# used during development and building the standalone executables. 
+# For these environments, use `pip install -r requirements.txt`
+# before installing capa from source/pypi. This will ensure
+# the following specific versions are used.
+#
+# Initially generated via: pip freeze | grep -v -- "-e"
+# Kept up to date by dependabot.
+annotated-types==0.7.0
+colorama==0.4.6
+cxxfilt==0.2.2
+dncil==1.0.2
+dnfile==0.15.0
+funcy==2.0
+humanize==4.9.0
+ida-netnode==3.0
+ida-settings==2.1.0
+intervaltree==3.1.0
+markdown-it-py==3.0.0
+mdurl==0.1.2
+msgpack==1.0.8
+networkx==3.1
+pefile==2023.2.7
+pip==24.0
+protobuf==5.27.1
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycparser==2.22
+pydantic==2.7.3
+pydantic-core==2.18.4
+pyelftools==0.31
+pygments==2.18.0
+python-flirt==0.8.10
+pyyaml==6.0.1
+rich==13.7.1
+ruamel-yaml==0.18.6
+ruamel-yaml-clib==0.2.8
+setuptools==70.0.0
+six==1.16.0
+sortedcontainers==2.4.0
+tabulate==0.9.0
+termcolor==2.4.0
+tqdm==4.66.4
+viv-utils==0.7.11
+vivisect==1.1.1
+wcwidth==0.2.13

diff --git a/rules b/rules
index 9e0ffdf7..e63c454f 160000
--- a/rules
+++ b/rules
@@ -1 +1 @@
-Subproject commit 9e0ffdf7c51af31bb668c8ffbbe7c8f6fd9199cb
+Subproject commit e63c454fbb9df14967a67479fee1e1615d54f4d6

diff --git a/scripts/capa2sarif.py b/scripts/capa2sarif.py
new file mode 100644
index 00000000..63e0c6e2
--- /dev/null
+++ b/scripts/capa2sarif.py
@@ -0,0 +1,358 @@
+# Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+"""
+Convert capa JSON output to the SARIF schema.
+
+usage: capa2sarif.py [-h] [-g] [-r] [-t TAG] [--version] capa_output
+
+Capa to SARIF analysis file
+
+positional arguments:
+  capa_output          Path to capa JSON output file
+
+optional arguments:
+  -h, --help           show this help message and exit
+  --version            show program's version number and exit
+  -g, --ghidra-compat  compatibility mode for Ghidra 11.0.X
+  -r, --radare-compat  compatibility mode for the Radare r2sarif plugin v2.0
+  -t TAG, --tag TAG    filter on rule meta field values (ruleid)
+
+Requires:
+  - sarif_om 1.0.4
+  - jschema_to_python 1.2.3
+"""
+import sys
+import json
+import logging
+import argparse
+from typing import List, Optional
+from pathlib import Path
+
+from capa.version import __version__
+
+logger = logging.getLogger("capa2sarif")
+
+# Dependencies
+try:
+    from sarif_om import Run, Tool, SarifLog, ToolComponent
+except ImportError as e:
+    logger.error(
+        "Required dependency `sarif_om` is not installed. Install it with `python3 -m pip install 'sarif_om>=1.0.4'`. %s",
+        e,
+    )
+    exit(-4)
+
+try:
+    from jschema_to_python.to_json import to_json
+except ImportError as e:
+    logger.error(
+        "Required dependency `jschema_to_python` is not installed. Install it with `python3 -m pip install 'jschema_to_python>=1.2.3'`. %s",
+        e,
+    )
+    exit(-4)
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Capa to SARIF analysis file")
+
+    # Positional argument
+    parser.add_argument("capa_output", help="Path to capa JSON output file")
+
+    # Optional arguments
+    parser.add_argument(
+        "-g",
+        "--ghidra-compat",
+        action="store_true",
+        help="Compatibility for Ghidra 11.0.X",
+    )
+    parser.add_argument(
+        "-r",
+        "--radare-compat",
+        action="store_true",
+        help="Compatibility for Radare r2sarif plugin v2.0",
+    )
+    parser.add_argument("-t", "--tag", help="Filter on rule meta field values (ruleid)")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+
+    return parser.parse_args()
+
+
+def main() -> int:
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger().setLevel(logging.INFO)
+
+    args = _parse_args()
+
+    try:
+        with Path(args.capa_output).open() as capa_output:
+            json_data = json.load(capa_output)
+    except json.JSONDecodeError:
+        # note: json.JSONDecodeError is a subclass of ValueError, so it must be
+        # handled first or the ValueError handler below would swallow it.
+        logger.error("Input data was not valid JSON, input should be a capa json output file.")
+        return -2
+    except ValueError:
+        logger.error("Input data was not valid JSON, input should be a capa json output file.")
+        return -1
+
+    # Marshal the capa JSON into SARIF.
+    # Create the baseline SARIF structure to be populated from the JSON data.
+    sarif_structure: Optional[dict] = _sarif_boilerplate(json_data["meta"], json_data["rules"])
+    if sarif_structure is None:
+        logger.error("An error occurred while creating the default SARIF structure.")
+        return -3
+
+    _populate_artifact(sarif_structure, json_data["meta"])
+    _populate_invocations(sarif_structure, json_data["meta"])
+    _populate_results(sarif_structure, json_data["rules"], args.ghidra_compat)
+
+    if args.ghidra_compat:
+        # Ghidra can't handle this structure as of 11.0.x
+        if "invocations" in sarif_structure["runs"][0]:
+            del sarif_structure["runs"][0]["invocations"]
+
+        # artifacts must include a description as well with a text field.
+        if "artifacts" in sarif_structure["runs"][0]:
+            sarif_structure["runs"][0]["artifacts"][0]["description"] = {"text": "placeholder"}
+
+        # For better compliance with the Ghidra table view, iterate through
+        # properties['additionalProperties']:
+        """
+        "additionalProperties": {
+            "to": "",
+            "offset": 0,
+            "primary": true,
+            "index": <>"",
+            "kind": "",
+            "opIndex": 0,
+            "sourceType": ""
+        }
+        """
+
+    if args.radare_compat:
+        # Add just enough for passing tests
+        _add_filler_optional(json_data, sarif_structure)
+
+    print(json.dumps(sarif_structure, indent=4))  # noqa: T201
+    return 0
+
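For reference, the rule-ID selection convention used by `_sarif_boilerplate` and
`_populate_results` below, distilled into a standalone sketch (the `meta` dict
mirrors the shape of capa's rule metadata as consumed by this script):

    def select_rule_id(meta: dict) -> str:
        # prefer the ATT&CK technique ID; fall back to MBC; else the rule name.
        if meta.get("attack"):
            return meta["attack"][0]["id"]
        if meta.get("mbc"):
            return meta["mbc"][0]["id"]
        return meta["name"]

    assert select_rule_id({"attack": [{"id": "T1055"}], "mbc": [], "name": "inject code"}) == "T1055"
    assert select_rule_id({"attack": [], "mbc": [], "name": "inject code"}) == "inject code"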
+def _sarif_boilerplate(data_meta: dict, data_rules: dict) -> Optional[dict]:
+    # Only track rules that appear in this log, not the full rule set.
+    rules = []
+    # Parse the rules from the capa report.
+    for key in data_rules:
+        # Prefer the ATT&CK ID for the ruleId; if both ATT&CK and MBC exist, use
+        # only ATT&CK; if neither exists, fall back to the rule name (using the
+        # long rule name as a ruleId is not good practice).
+        attack_length = len(data_rules[key]["meta"]["attack"])
+        mbc_length = len(data_rules[key]["meta"]["mbc"])
+        if attack_length or mbc_length:
+            id = (
+                data_rules[key]["meta"]["attack"][0]["id"]
+                if attack_length > 0
+                else data_rules[key]["meta"]["mbc"][0]["id"]
+            )
+        else:
+            id = data_rules[key]["meta"]["name"]
+
+        # Append current rule
+        rules.append(
+            {
+                # Default to attack identifier, fall back to MBC, mainly relevant if both are present
+                "id": id,
+                "name": data_rules[key]["meta"]["name"],
+                "shortDescription": {"text": data_rules[key]["meta"]["name"]},
+                "messageStrings": {"default": {"text": data_rules[key]["meta"]["name"]}},
+                "properties": {
+                    "namespace": data_rules[key]["meta"]["namespace"] if "namespace" in data_rules[key]["meta"] else [],
+                    "scopes": data_rules[key]["meta"]["scopes"],
+                    "references": data_rules[key]["meta"]["references"],
+                    "lib": data_rules[key]["meta"]["lib"],
+                },
+            }
+        )
+
+    tool = Tool(
+        driver=ToolComponent(
+            name="Capa",
+            version=__version__,
+            information_uri="https://github.com/mandiant/capa",
+            rules=rules,
+        )
+    )
+
+    # Create a SARIF Log object, populate with a single run
+    sarif_log = SarifLog(
+        version="2.1.0",
+        schema_uri="https://docs.oasis-open.org/sarif/sarif/v2.1.0/cos02/schemas/sarif-schema-2.1.0.json",
+        runs=[Run(tool=tool, results=[], artifacts=[], invocations=[])],
+    )
+
+    # Convert the SARIF log to a dictionary and then to a JSON string
+    try:
+        sarif_outline = json.loads(to_json(sarif_log))
+    except json.JSONDecodeError:
+        # An exception has occurred
+        return None
+
+    return sarif_outline
+
+
+def _populate_artifact(sarif_log: dict, meta_data: dict) -> None:
+    """
+    @param sarif_log: dict - sarif data structure including runs
+    @param meta_data: dict - Capa meta output
+    @returns None, updates sarif_log via side-effects
+    """
+    sample = meta_data["sample"]
+    artifact = {
+        "location": {"uri": sample["path"]},
+        "roles": ["analysisTarget"],
+        "hashes": {
+            "md5": sample["md5"],
+            "sha-1": sample["sha1"],
+            "sha-256": sample["sha256"],
+        },
+    }
+    sarif_log["runs"][0]["artifacts"].append(artifact)
+
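For orientation, the single-run document skeleton that `_sarif_boilerplate`
produces before the populate steps fill it in (a sketch assembled from the
`SarifLog` constructor call above; the rules list is elided):

    minimal_sarif = {
        "version": "2.1.0",
        "$schema": "https://docs.oasis-open.org/sarif/sarif/v2.1.0/cos02/schemas/sarif-schema-2.1.0.json",
        "runs": [
            {
                "tool": {"driver": {"name": "Capa", "informationUri": "https://github.com/mandiant/capa", "rules": []}},
                "results": [],      # filled by _populate_results
                "artifacts": [],    # filled by _populate_artifact
                "invocations": [],  # filled by _populate_invocations
            }
        ],
    }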
+def _populate_invocations(sarif_log: dict, meta_data: dict) -> None:
+    """
+    @param sarif_log: dict - sarif data structure including runs
+    @param meta_data: dict - Capa meta output
+    @returns None, updates sarif_log via side-effects
+    """
+    analysis_time = meta_data["timestamp"]
+    argv = meta_data["argv"]
+    analysis = meta_data["analysis"]
+    invoke = {
+        "commandLine": "capa " + " ".join(argv),
+        "arguments": argv if len(argv) > 0 else [],
+        # Format in Zulu time, this may require a conversion from local timezone
+        "endTimeUtc": f"{analysis_time}Z",
+        "executionSuccessful": True,
+        "properties": {
+            "format": analysis["format"],
+            "arch": analysis["arch"],
+            "os": analysis["os"],
+            "extractor": analysis["extractor"],
+            "rule_location": analysis["rules"],
+            "base_address": analysis["base_address"],
+        },
+    }
+    sarif_log["runs"][0]["invocations"].append(invoke)
+
+
+def _enumerate_evidence(node: dict, related_count: int) -> List[dict]:
+    related_locations = []
+    if node.get("success") and node.get("node", {}).get("type") != "statement":
+        label = ""
+        if node.get("node", {}).get("type") == "feature":
+            if node.get("node", {}).get("feature", {}).get("type") == "api":
+                label = "api: " + node.get("node", {}).get("feature", {}).get("api")
+            elif node.get("node", {}).get("feature", {}).get("type") == "match":
+                label = "match: " + node.get("node", {}).get("feature", {}).get("match")
+            elif node.get("node", {}).get("feature", {}).get("type") == "number":
+                label = f"number: {node.get('node', {}).get('feature', {}).get('description')} ({node.get('node', {}).get('feature', {}).get('number')})"
+            elif node.get("node", {}).get("feature", {}).get("type") == "offset":
+                label = f"offset: {node.get('node', {}).get('feature', {}).get('description')} ({node.get('node', {}).get('feature', {}).get('offset')})"
+            elif node.get("node", {}).get("feature", {}).get("type") == "mnemonic":
+                label = f"mnemonic: {node.get('node', {}).get('feature', {}).get('mnemonic')}"
+            elif node.get("node", {}).get("feature", {}).get("type") == "characteristic":
+                label = f"characteristic: {node.get('node', {}).get('feature', {}).get('characteristic')}"
+            elif node.get("node", {}).get("feature", {}).get("type") == "os":
+                label = f"os: {node.get('node', {}).get('feature', {}).get('os')}"
+            elif node.get("node", {}).get("feature", {}).get("type") == "operand number":
+                label = f"operand: ({node.get('node', {}).get('feature', {}).get('index')} ) {node.get('node', {}).get('feature', {}).get('description')} ({node.get('node', {}).get('feature', {}).get('operand_number')})"
+            else:
+                logger.error(
+                    "Not implemented %s",
+                    node.get("node", {}).get("feature", {}).get("type"),
+                )
+                return []
+        else:
+            logger.error("Not implemented %s", node.get("node", {}).get("type"))
+            return []
+
+        for loc in node.get("locations", []):
+            if loc["type"] != "absolute":
+                continue
+
+            related_locations.append(
+                {
+                    "id": related_count,
+                    "message": {"text": label},
+                    "physicalLocation": {"address": {"absoluteAddress": loc["value"]}},
+                }
+            )
+            related_count += 1
+
+    if node.get("success") and node.get("node", {}).get("type") == "statement":
+        for child in node.get("children", []):
+            related_locations += _enumerate_evidence(child, related_count)
+
+    return related_locations
+
+
+def _populate_results(sarif_log: dict, data_rules: dict, ghidra_compat: bool) -> None:
+    """
+    @param sarif_log: dict - sarif data structure including runs
+    @param data_rules: dict - the "rules" section of the capa report
+    @param ghidra_compat: bool - emit Ghidra-compatible level/kind values
+    @returns None, updates sarif_log via side-effects
+    """
+    results = sarif_log["runs"][0]["results"]
+
+    # Parse the rules from the capa report.
+    for key in data_rules:
+        # Prefer the ATT&CK ID for the ruleId; if both ATT&CK and MBC exist, use
+        # only ATT&CK; if neither exists, fall back to the rule name (using the
+        # long rule name as a ruleId is not good practice).
+ attack_length = len(data_rules[key]["meta"]["attack"]) + mbc_length = len(data_rules[key]["meta"]["mbc"]) + if attack_length or mbc_length: + id = ( + data_rules[key]["meta"]["attack"][0]["id"] + if attack_length > 0 + else data_rules[key]["meta"]["mbc"][0]["id"] + ) + else: + id = data_rules[key]["meta"]["name"] + + for address, details in data_rules[key]["matches"]: + related_cnt = 0 + related_locations = _enumerate_evidence(details, related_cnt) + + res = { + "ruleId": id, + "level": "none" if not ghidra_compat else "NONE", + "message": {"text": data_rules[key]["meta"]["name"]}, + "kind": "informational" if not ghidra_compat else "INFORMATIONAL", + "locations": [ + { + "physicalLocation": { + "address": { + "absoluteAddress": address["value"], + } + }, + } + ], + } + if not ghidra_compat: + res["relatedLocations"] = related_locations + + results.append(res) + + +def _add_filler_optional(capa_result: dict, sarif_log: dict) -> None: + """Update sarif file with just enough fields to pass radare tests""" + base_address = capa_result["meta"]["analysis"]["base_address"]["value"] + # Assume there is only one run, and one binary artifact + artifact = sarif_log["runs"][0]["artifacts"][0] + if "properties" not in artifact: + artifact["properties"] = {} + if "additionalProperties" not in artifact["properties"]: + artifact["properties"]["additionalProperties"] = {} + if "imageBase" not in artifact["properties"]["additionalProperties"]: + artifact["properties"]["additionalProperties"]["imageBase"] = base_address + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/data b/tests/data index 3f5f77f9..3a769017 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 3f5f77f945be736f6942683f232efa96b98b9876 +Subproject commit 3a7690178bcb05671bf4d33f3d117715272fe538 diff --git a/tests/fixtures.py b/tests/fixtures.py index 53104386..eae5bc25 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -389,6 +389,8 @@ def get_data_path_by_name(name) -> Path: return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_" elif name.startswith("1038a2"): return CD / "data" / "1038a23daad86042c66bfe6c9d052d27048de9653bde5750dc0f240c792d9ac8.elf_" + elif name.startswith("3da7c"): + return CD / "data" / "3da7c2c70a2d93ac4643f20339d5c7d61388bddd77a4a5fd732311efad78e535.elf_" elif name.startswith("nested_typedef"): return CD / "data" / "dotnet" / "dd9098ff91717f4906afe9dafdfa2f52.exe_" elif name.startswith("nested_typeref"): diff --git a/tests/test_os_detection.py b/tests/test_os_detection.py index 0902ab55..34ced333 100644 --- a/tests/test_os_detection.py +++ b/tests/test_os_detection.py @@ -92,6 +92,12 @@ def test_elf_android_notes(): assert capa.features.extractors.elf.detect_elf_os(f) == "android" +def test_elf_go_buildinfo(): + path = get_data_path_by_name("3da7c") + with Path(path).open("rb") as f: + assert capa.features.extractors.elf.detect_elf_os(f) == "linux" + + def test_elf_parse_capa_pyinstaller_header(): # error after misidentified large pydata section with address 0; fixed in #1454 # compressed ELF header of capa-v5.1.0-linux diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 052b1c89..9bad3013 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -40,6 +40,10 @@ def get_rule_path(): [ pytest.param("capa2yara.py", [get_rules_path()]), pytest.param("capafmt.py", [get_rule_path()]), + pytest.param( + "capa2sarif.py", + [Path(__file__).resolve().parent / "data" / "rd" / "Practical Malware Analysis Lab 
01-01.dll_.json"], + ), # testing some variations of linter script pytest.param("lint.py", ["-t", "create directory", get_rules_path()]), # `create directory` rule has native and .NET example PEs
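For completeness, an end-to-end sketch of the conversion flow exercised by the
test above (file names are illustrative; the script prints the SARIF document
to stdout):

    # capa --json suspicious.exe_ > capa_output.json
    # python scripts/capa2sarif.py capa_output.json > capa_output.sarif

    import subprocess
    import sys

    completed = subprocess.run(
        [sys.executable, "scripts/capa2sarif.py", "capa_output.json"],
        capture_output=True,
        text=True,
        check=True,
    )
    sarif_document = completed.stdout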