mirror of
https://github.com/mandiant/capa.git
synced 2026-03-15 14:28:58 -07:00
Compare commits
88 Commits
bn-cache-i
...
codecut
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d89083ab31 | ||
|
|
891fa8aaa3 | ||
|
|
e94147b4c2 | ||
|
|
6fc4567f0c | ||
|
|
3b1a8f5b5a | ||
|
|
f296e7d423 | ||
|
|
3e02b67480 | ||
|
|
536526f61d | ||
|
|
bcd2c3fb35 | ||
|
|
f340b93a02 | ||
|
|
8bd6f8b99a | ||
|
|
8b4d5d3d22 | ||
|
|
bc6e18ed85 | ||
|
|
2426aba223 | ||
|
|
405e189267 | ||
|
|
cfb632edc8 | ||
|
|
70c96a29b4 | ||
|
|
c005de0a0d | ||
|
|
8d42b14b20 | ||
|
|
bad32b91fb | ||
|
|
9716da4765 | ||
|
|
e0784f2e85 | ||
|
|
4a775bab2e | ||
|
|
2de7830f5e | ||
|
|
9d67e133c9 | ||
|
|
fa18b4e201 | ||
|
|
c3c93685e2 | ||
|
|
462e11443e | ||
|
|
32d6181f02 | ||
|
|
6cf944b321 | ||
|
|
369fbc713e | ||
|
|
e3a1dbfac2 | ||
|
|
e5fe935a8e | ||
|
|
233f8dcf9f | ||
|
|
51d606bc0d | ||
|
|
2b46796d08 | ||
|
|
81f7f43b5b | ||
|
|
1f34795fce | ||
|
|
06f0012183 | ||
|
|
55720ddbfd | ||
|
|
893378c10e | ||
|
|
1a82b9d0c5 | ||
|
|
3cbc184020 | ||
|
|
347601a112 | ||
|
|
8a02b0773d | ||
|
|
f11661f8f2 | ||
|
|
518dc3381c | ||
|
|
5c60adaf96 | ||
|
|
4ab8d75629 | ||
|
|
51d852d1b3 | ||
|
|
aa8e4603d1 | ||
|
|
6c61a91778 | ||
|
|
e633e34517 | ||
|
|
9c72c9067b | ||
|
|
168435cf75 | ||
|
|
5fdf7e61e2 | ||
|
|
95fc747e6f | ||
|
|
1f374e4986 | ||
|
|
28c0234339 | ||
|
|
f57f909e68 | ||
|
|
02c359f79f | ||
|
|
4448d612f1 | ||
|
|
d7cf8d1251 | ||
|
|
d1f3e43325 | ||
|
|
83a46265df | ||
|
|
0c64bd4985 | ||
|
|
ed86e5fb1b | ||
|
|
e1c786466a | ||
|
|
959a234f0e | ||
|
|
e57de2beb4 | ||
|
|
9c9b3711c0 | ||
|
|
65e2dac4c4 | ||
|
|
9ad3f06e1d | ||
|
|
201ec07b58 | ||
|
|
c85be8dc72 | ||
|
|
54952feb07 | ||
|
|
379d6ef313 | ||
|
|
28fcd10d2e | ||
|
|
a6481df6c4 | ||
|
|
abe80842cb | ||
|
|
b6763ac5fe | ||
|
|
5a284de438 | ||
|
|
8cfccbcb44 | ||
|
|
01772d0de0 | ||
|
|
f0042157ab | ||
|
|
6a2330c11a | ||
|
|
02b5e11380 | ||
|
|
32c428b989 |
30
.github/workflows/web-release.yml
vendored
30
.github/workflows/web-release.yml
vendored
@@ -69,11 +69,35 @@ jobs:
|
||||
run: ls -t capa-explorer-web-v*.zip | tail -n +4 | xargs -r rm --
|
||||
working-directory: web/explorer/releases
|
||||
|
||||
- name: Commit and push release
|
||||
- name: Stage release files
|
||||
run: |
|
||||
git config --local user.email "capa-dev@mandiant.com"
|
||||
git config --local user.name "Capa Bot"
|
||||
git add -f web/explorer/releases/${{ env.RELEASE_NAME }}.zip web/explorer/releases/CHANGELOG.md
|
||||
git add -u web/explorer/releases/
|
||||
git commit -m ":robot: explorer web: add release ${{ env.RELEASE_NAME }}"
|
||||
git push
|
||||
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@5e914681df9dc83aa4e4905692ca88beb2f9e91f # v7.0.5
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
title: "explorer web: add release v${{ github.event.inputs.version }}"
|
||||
body: |
|
||||
This PR adds a new capa Explorer Web release v${{ github.event.inputs.version }}.
|
||||
|
||||
Release details:
|
||||
- Name: ${{ env.RELEASE_NAME }}
|
||||
- SHA256: ${{ env.RELEASE_SHA256 }}
|
||||
|
||||
This release is generated by the [web release](https://github.com/mandiant/capa/actions/workflows/web-release.yml) workflow.
|
||||
|
||||
- [x] No CHANGELOG update needed
|
||||
- [x] No new tests needed
|
||||
- [x] No documentation update needed
|
||||
commit-message: ":robot: explorer web: add release ${{ env.RELEASE_NAME }}"
|
||||
branch: release/web-v${{ github.event.inputs.version }}
|
||||
add-paths: web/explorer/releases/${{ env.RELEASE_NAME }}.zip
|
||||
base: master
|
||||
labels: webui
|
||||
delete-branch: true
|
||||
committer: Capa Bot <capa-dev@mandiant.com>
|
||||
author: Capa Bot <capa-dev@mandiant.com>
|
||||
|
||||
@@ -25,7 +25,7 @@ repos:
|
||||
hooks:
|
||||
- id: isort
|
||||
name: isort
|
||||
stages: [commit, push, manual]
|
||||
stages: [pre-commit, pre-push, manual]
|
||||
language: system
|
||||
entry: isort
|
||||
args:
|
||||
@@ -46,7 +46,7 @@ repos:
|
||||
hooks:
|
||||
- id: black
|
||||
name: black
|
||||
stages: [commit, push, manual]
|
||||
stages: [pre-commit, pre-push, manual]
|
||||
language: system
|
||||
entry: black
|
||||
args:
|
||||
@@ -64,7 +64,7 @@ repos:
|
||||
hooks:
|
||||
- id: ruff
|
||||
name: ruff
|
||||
stages: [commit, push, manual]
|
||||
stages: [pre-commit, pre-push, manual]
|
||||
language: system
|
||||
entry: ruff
|
||||
args:
|
||||
@@ -82,7 +82,7 @@ repos:
|
||||
hooks:
|
||||
- id: flake8
|
||||
name: flake8
|
||||
stages: [push, manual]
|
||||
stages: [pre-push, manual]
|
||||
language: system
|
||||
entry: flake8
|
||||
args:
|
||||
@@ -101,13 +101,14 @@ repos:
|
||||
hooks:
|
||||
- id: mypy
|
||||
name: mypy
|
||||
stages: [push, manual]
|
||||
stages: [pre-push, manual]
|
||||
language: system
|
||||
entry: mypy
|
||||
args:
|
||||
- "--check-untyped-defs"
|
||||
- "--ignore-missing-imports"
|
||||
- "--config-file=.github/mypy/mypy.ini"
|
||||
- "--enable-incomplete-feature=NewGenericSyntax"
|
||||
- "capa/"
|
||||
- "scripts/"
|
||||
- "tests/"
|
||||
@@ -119,7 +120,7 @@ repos:
|
||||
hooks:
|
||||
- id: deptry
|
||||
name: deptry
|
||||
stages: [push, manual]
|
||||
stages: [pre-push, manual]
|
||||
language: system
|
||||
entry: deptry .
|
||||
always_run: true
|
||||
|
||||
108
CHANGELOG.md
108
CHANGELOG.md
@@ -4,13 +4,61 @@
|
||||
|
||||
### New Features
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
### New Rules (0)
|
||||
|
||||
-
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- vmray: load more analysis archives @mr-tz
|
||||
- dynamic: only check file limitations for static file formats @mr-tz
|
||||
- vmray: skip non-printable strings @mike-hunhoff
|
||||
|
||||
### capa Explorer Web
|
||||
|
||||
### capa Explorer IDA Pro plugin
|
||||
|
||||
### Development
|
||||
|
||||
### Raw diffs
|
||||
- [capa v8.0.1...master](https://github.com/mandiant/capa/compare/v8.0.1...master)
|
||||
- [capa-rules v8.0.1...master](https://github.com/mandiant/capa-rules/compare/v8.0.1...master)
|
||||
|
||||
## v8.0.1
|
||||
|
||||
This point release fixes an issue with the IDAPython API to now handle IDA Pro 8.3, 8.4, and 9.0 correctly.
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- handle IDA 8.3/8.4 vs. 9.0 API change @mr-tz
|
||||
|
||||
### Raw diffs
|
||||
- [capa v8.0.0...v8.0.1](https://github.com/mandiant/capa/compare/v8.0.0...v8.0.1)
|
||||
- [capa-rules v8.0.0...v8.0.1](https://github.com/mandiant/capa-rules/compare/v8.0.0...v8.0.1)
|
||||
|
||||
## v8.0.0
|
||||
|
||||
capa version 8 adds support for IDA Pro 9.0 (and idalib). The release comes with various improvements and bug fixes for the Binary Ninja backend (including to load with database files) -- thanks to @xusheng6.
|
||||
|
||||
Additional bug fixes improve the dynamic and BinExport backends.
|
||||
|
||||
capa version 8 now requires Python 3.10 or newer.
|
||||
|
||||
Special thanks to @Tamir-K, @harshit-wadhwani, @jorik-utwente for their great contributions.
|
||||
|
||||
### New Features
|
||||
|
||||
- allow call as valid subscope for call scoped rules @mr-tz
|
||||
- support loading and analyzing a Binary Ninja database #2496 @xusheng6
|
||||
- vmray: record process command line details @mr-tz
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
- remove support for Python 3.8 and use Python 3.10 as minimum now #1966 @mr-tz
|
||||
|
||||
### New Rules (8)
|
||||
### New Rules (54)
|
||||
|
||||
- nursery/get-shadow-password-file-entry-on-linux jonathanlepore@google.com
|
||||
- nursery/set-shadow-password-file-entry-on-linux jonathanlepore@google.com
|
||||
@@ -20,7 +68,52 @@
|
||||
- nursery/persist-via-application-shimming j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-bits-job j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-print-processors-registry-key j.j.vannielen@utwente.nl
|
||||
-
|
||||
- linking/static/touchsocket/linked-against-touchsocket still@teamt5.org
|
||||
- runtime/dotnet/compiled-with-dotnet-aot still@teamt5.org
|
||||
- nursery/persist-via-errorhandler-script j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-get-variable-hijack j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-iphlpapi-dll-hijack j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-lnk-shortcut j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-powershell-profile j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-windows-accessibility-tools j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-windows-terminal-profile j.j.vannielen@utwente.nl
|
||||
- nursery/write-to-browser-extension-directory j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-aedebug-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-amsi-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-app-paths-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-appcertdlls-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-appx-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-autodialdll-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-autoplayhandlers-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-bootverificationprogram-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-code-signing-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-com-hijack j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-command-processor-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-contextmenuhandlers-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-cor_profiler_path-registry-value j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-default-file-association-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-disk-cleanup-handler-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-dotnet-dbgmanageddebugger-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-dotnet_startup_hooks-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-explorer-tools-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-filter-handlers-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-group-policy-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-hhctrl-com-hijack j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-htmlhelp-author-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-image-file-execution-options-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-lsa-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-natural-language-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-netsh-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-network-provider-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-path-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-print-monitors-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-rdp-startup-programs-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-silentprocessexit-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-telemetrycontroller-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-timeproviders-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-ts-initialprogram-registry-key j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-userinitmprlogonscript-registry-value j.j.vannielen@utwente.nl
|
||||
- nursery/persist-via-windows-error-reporting-registry-key j.j.vannielen@utwente.nl
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
@@ -29,6 +122,13 @@
|
||||
- ghidra: fix saving of base address @mr-tz
|
||||
- binja: support loading raw x86/x86_64 shellcode #2489 @xusheng6
|
||||
- binja: fix crash when the IL of certain functions are not available. #2249 @xusheng6
|
||||
- binja: major performance improvement on the binja extractor. #1414 @xusheng6
|
||||
- cape: make Process model flexible and procmemory optional to load newest reports #2466 @mr-tz
|
||||
- binja: fix unit test failure by fixing up the analysis for file al-khaser_x64.exe_ #2507 @xusheng6
|
||||
- binja: move the stack string detection to function level #2516 @xusheng6
|
||||
- BinExport2: fix handling of incorrect thunk functions #2524 @williballenthin
|
||||
- BinExport2: more precise pruning of expressions @williballenthin
|
||||
- BinExport2: better handle weird expression trees from Ghidra #2528 #2530 @williballenthin
|
||||
|
||||
### capa Explorer Web
|
||||
|
||||
@@ -42,8 +142,8 @@
|
||||
- CI: update Binary Ninja version to 4.2 #2499 @xusheng6
|
||||
|
||||
### Raw diffs
|
||||
- [capa v7.4.0...master](https://github.com/mandiant/capa/compare/v7.4.0...master)
|
||||
- [capa-rules v7.4.0...master](https://github.com/mandiant/capa-rules/compare/v7.4.0...master)
|
||||
- [capa v7.4.0...v8.0.0](https://github.com/mandiant/capa/compare/v7.4.0...v8.0.0)
|
||||
- [capa-rules v7.4.0...v8.0.0](https://github.com/mandiant/capa-rules/compare/v7.4.0...v8.0.0)
|
||||
|
||||
## v7.4.0
|
||||
|
||||
|
||||
82
README.md
82
README.md
@@ -38,49 +38,47 @@ Below you find a list of [our capa blog posts with more details.](#blog-posts)
|
||||
```
|
||||
$ capa.exe suspicious.exe
|
||||
|
||||
+------------------------+--------------------------------------------------------------------------------+
|
||||
| ATT&CK Tactic | ATT&CK Technique |
|
||||
|------------------------+--------------------------------------------------------------------------------|
|
||||
| DEFENSE EVASION | Obfuscated Files or Information [T1027] |
|
||||
| DISCOVERY | Query Registry [T1012] |
|
||||
| | System Information Discovery [T1082] |
|
||||
| EXECUTION | Command and Scripting Interpreter::Windows Command Shell [T1059.003] |
|
||||
| | Shared Modules [T1129] |
|
||||
| EXFILTRATION | Exfiltration Over C2 Channel [T1041] |
|
||||
| PERSISTENCE | Create or Modify System Process::Windows Service [T1543.003] |
|
||||
+------------------------+--------------------------------------------------------------------------------+
|
||||
+--------------------+------------------------------------------------------------------------+
|
||||
| ATT&CK Tactic | ATT&CK Technique |
|
||||
|--------------------+------------------------------------------------------------------------|
|
||||
| DEFENSE EVASION | Obfuscated Files or Information [T1027] |
|
||||
| DISCOVERY | Query Registry [T1012] |
|
||||
| | System Information Discovery [T1082] |
|
||||
| EXECUTION | Command and Scripting Interpreter::Windows Command Shell [T1059.003] |
|
||||
| | Shared Modules [T1129] |
|
||||
| EXFILTRATION | Exfiltration Over C2 Channel [T1041] |
|
||||
| PERSISTENCE | Create or Modify System Process::Windows Service [T1543.003] |
|
||||
+--------------------+------------------------------------------------------------------------+
|
||||
|
||||
+-------------------------------------------------------+-------------------------------------------------+
|
||||
| CAPABILITY | NAMESPACE |
|
||||
|-------------------------------------------------------+-------------------------------------------------|
|
||||
| check for OutputDebugString error | anti-analysis/anti-debugging/debugger-detection |
|
||||
| read and send data from client to server | c2/file-transfer |
|
||||
| execute shell command and capture output | c2/shell |
|
||||
| receive data (2 matches) | communication |
|
||||
| send data (6 matches) | communication |
|
||||
| connect to HTTP server (3 matches) | communication/http/client |
|
||||
| send HTTP request (3 matches) | communication/http/client |
|
||||
| create pipe | communication/named-pipe/create |
|
||||
| get socket status (2 matches) | communication/socket |
|
||||
| receive data on socket (2 matches) | communication/socket/receive |
|
||||
| send data on socket (3 matches) | communication/socket/send |
|
||||
| connect TCP socket | communication/socket/tcp |
|
||||
| encode data using Base64 | data-manipulation/encoding/base64 |
|
||||
| encode data using XOR (6 matches) | data-manipulation/encoding/xor |
|
||||
| run as a service | executable/pe |
|
||||
| get common file path (3 matches) | host-interaction/file-system |
|
||||
| read file | host-interaction/file-system/read |
|
||||
| write file (2 matches) | host-interaction/file-system/write |
|
||||
| print debug messages (2 matches) | host-interaction/log/debug/write-event |
|
||||
| resolve DNS | host-interaction/network/dns/resolve |
|
||||
| get hostname | host-interaction/os/hostname |
|
||||
| create a process with modified I/O handles and window | host-interaction/process/create |
|
||||
| create process | host-interaction/process/create |
|
||||
| create registry key | host-interaction/registry/create |
|
||||
| create service | host-interaction/service/create |
|
||||
| create thread | host-interaction/thread/create |
|
||||
| persist via Windows service | persistence/service |
|
||||
+-------------------------------------------------------+-------------------------------------------------+
|
||||
+-------------------------------------------+-------------------------------------------------+
|
||||
| CAPABILITY | NAMESPACE |
|
||||
|-------------------------------------------+-------------------------------------------------|
|
||||
| read and send data from client to server | c2/file-transfer |
|
||||
| execute shell command and capture output | c2/shell |
|
||||
| receive data (2 matches) | communication |
|
||||
| send data (6 matches) | communication |
|
||||
| connect to HTTP server (3 matches) | communication/http/client |
|
||||
| send HTTP request (3 matches) | communication/http/client |
|
||||
| create pipe | communication/named-pipe/create |
|
||||
| get socket status (2 matches) | communication/socket |
|
||||
| receive data on socket (2 matches) | communication/socket/receive |
|
||||
| send data on socket (3 matches) | communication/socket/send |
|
||||
| connect TCP socket | communication/socket/tcp |
|
||||
| encode data using Base64 | data-manipulation/encoding/base64 |
|
||||
| encode data using XOR (6 matches) | data-manipulation/encoding/xor |
|
||||
| run as a service | executable/pe |
|
||||
| get common file path (3 matches) | host-interaction/file-system |
|
||||
| read file | host-interaction/file-system/read |
|
||||
| write file (2 matches) | host-interaction/file-system/write |
|
||||
| print debug messages (2 matches) | host-interaction/log/debug/write-event |
|
||||
| resolve DNS | host-interaction/network/dns/resolve |
|
||||
| get hostname | host-interaction/os/hostname |
|
||||
| create process | host-interaction/process/create |
|
||||
| create registry key | host-interaction/registry/create |
|
||||
| create service | host-interaction/service/create |
|
||||
| create thread | host-interaction/thread/create |
|
||||
| persist via Windows service | persistence/service |
|
||||
+-------------------------------------------+-------------------------------------------------+
|
||||
```
|
||||
|
||||
# download and usage
|
||||
|
||||
0
capa/analysis/__init__.py
Normal file
0
capa/analysis/__init__.py
Normal file
38
capa/analysis/flirt.py
Normal file
38
capa/analysis/flirt.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
import capa.features.extractors.ida.idalib as idalib
|
||||
|
||||
if not idalib.has_idalib():
|
||||
raise RuntimeError("cannot find IDA idalib module.")
|
||||
|
||||
if not idalib.load_idalib():
|
||||
raise RuntimeError("failed to load IDA idalib module.")
|
||||
|
||||
import idaapi
|
||||
import idautils
|
||||
|
||||
|
||||
class FunctionId(BaseModel):
|
||||
va: int
|
||||
is_library: bool
|
||||
name: str
|
||||
|
||||
|
||||
def get_flirt_matches(lib_only=True):
|
||||
for fva in idautils.Functions():
|
||||
f = idaapi.get_func(fva)
|
||||
is_lib = bool(f.flags & idaapi.FUNC_LIB)
|
||||
fname = idaapi.get_func_name(fva)
|
||||
|
||||
if lib_only and not is_lib:
|
||||
continue
|
||||
|
||||
yield FunctionId(va=fva, is_library=is_lib, name=fname)
|
||||
242
capa/analysis/libraries.py
Normal file
242
capa/analysis/libraries.py
Normal file
@@ -0,0 +1,242 @@
|
||||
# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
import io
|
||||
import sys
|
||||
import logging
|
||||
import argparse
|
||||
import tempfile
|
||||
import contextlib
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
import rich
|
||||
from pydantic import BaseModel
|
||||
from rich.text import Text
|
||||
from rich.console import Console
|
||||
|
||||
import capa.main
|
||||
import capa.helpers
|
||||
import capa.analysis.flirt
|
||||
import capa.analysis.strings
|
||||
import capa.features.extractors.ida.idalib as idalib
|
||||
|
||||
if not idalib.has_idalib():
|
||||
raise RuntimeError("cannot find IDA idalib module.")
|
||||
|
||||
if not idalib.load_idalib():
|
||||
raise RuntimeError("failed to load IDA idalib module.")
|
||||
|
||||
import idaapi
|
||||
import idapro
|
||||
import ida_auto
|
||||
import idautils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Classification(str, Enum):
|
||||
USER = "user"
|
||||
LIBRARY = "library"
|
||||
UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class Method(str, Enum):
|
||||
FLIRT = "flirt"
|
||||
STRINGS = "strings"
|
||||
THUNK = "thunk"
|
||||
ENTRYPOINT = "entrypoint"
|
||||
|
||||
|
||||
class FunctionClassification(BaseModel):
|
||||
va: int
|
||||
classification: Classification
|
||||
# name per the disassembler/analysis tool
|
||||
# may be combined with the recovered/suspected name TODO below
|
||||
name: str
|
||||
|
||||
# if is library, this must be provided
|
||||
method: Optional[Method]
|
||||
|
||||
# TODO if is library, recovered/suspected name?
|
||||
|
||||
# if is library, these can optionally be provided.
|
||||
library_name: Optional[str] = None
|
||||
library_version: Optional[str] = None
|
||||
|
||||
|
||||
class FunctionIdResults(BaseModel):
|
||||
function_classifications: List[FunctionClassification]
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def ida_session(input_path: Path, use_temp_dir=True):
|
||||
if use_temp_dir:
|
||||
t = Path(tempfile.mkdtemp(prefix="ida-")) / input_path.name
|
||||
else:
|
||||
t = input_path
|
||||
|
||||
logger.debug("using %s", str(t))
|
||||
# stderr=True is used here to redirect the spinner banner to stderr,
|
||||
# so that users can redirect capa's output.
|
||||
console = Console(stderr=True, quiet=False)
|
||||
|
||||
try:
|
||||
if use_temp_dir:
|
||||
t.write_bytes(input_path.read_bytes())
|
||||
|
||||
# idalib writes to stdout (ugh), so we have to capture that
|
||||
# so as not to screw up structured output.
|
||||
with capa.helpers.stdout_redirector(io.BytesIO()):
|
||||
idapro.enable_console_messages(False)
|
||||
with capa.main.timing("analyze program"):
|
||||
with console.status("analyzing program...", spinner="dots"):
|
||||
if idapro.open_database(str(t.absolute()), run_auto_analysis=True):
|
||||
raise RuntimeError("failed to analyze input file")
|
||||
|
||||
logger.debug("idalib: waiting for analysis...")
|
||||
ida_auto.auto_wait()
|
||||
logger.debug("idalib: opened database.")
|
||||
|
||||
yield
|
||||
finally:
|
||||
idapro.close_database()
|
||||
if use_temp_dir:
|
||||
t.unlink()
|
||||
|
||||
|
||||
def is_thunk_function(fva):
|
||||
f = idaapi.get_func(fva)
|
||||
return bool(f.flags & idaapi.FUNC_THUNK)
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
if argv is None:
|
||||
argv = sys.argv[1:]
|
||||
|
||||
parser = argparse.ArgumentParser(description="Identify library functions using various strategies.")
|
||||
capa.main.install_common_args(parser, wanted={"input_file"})
|
||||
parser.add_argument("--store-idb", action="store_true", default=False, help="store IDA database file")
|
||||
parser.add_argument("--min-string-length", type=int, default=8, help="minimum string length")
|
||||
parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text")
|
||||
args = parser.parse_args(args=argv)
|
||||
|
||||
try:
|
||||
capa.main.handle_common_args(args)
|
||||
except capa.main.ShouldExitError as e:
|
||||
return e.status_code
|
||||
|
||||
dbs = capa.analysis.strings.get_default_databases()
|
||||
capa.analysis.strings.prune_databases(dbs, n=args.min_string_length)
|
||||
|
||||
function_classifications: List[FunctionClassification] = []
|
||||
with ida_session(args.input_file, use_temp_dir=not args.store_idb):
|
||||
with capa.main.timing("FLIRT-based library identification"):
|
||||
# TODO: add more signature (files)
|
||||
# TOOD: apply more signatures
|
||||
for flirt_match in capa.analysis.flirt.get_flirt_matches():
|
||||
function_classifications.append(
|
||||
FunctionClassification(
|
||||
va=flirt_match.va,
|
||||
name=flirt_match.name,
|
||||
classification=Classification.LIBRARY,
|
||||
method=Method.FLIRT,
|
||||
# note: we cannot currently include which signature matched per function via the IDA API
|
||||
)
|
||||
)
|
||||
|
||||
# thunks
|
||||
for fva in idautils.Functions():
|
||||
if is_thunk_function(fva):
|
||||
function_classifications.append(
|
||||
FunctionClassification(
|
||||
va=fva,
|
||||
name=idaapi.get_func_name(fva),
|
||||
classification=Classification.LIBRARY,
|
||||
method=Method.THUNK,
|
||||
)
|
||||
)
|
||||
|
||||
with capa.main.timing("string-based library identification"):
|
||||
for string_match in capa.analysis.strings.get_string_matches(dbs):
|
||||
function_classifications.append(
|
||||
FunctionClassification(
|
||||
va=string_match.va,
|
||||
name=idaapi.get_func_name(string_match.va),
|
||||
classification=Classification.LIBRARY,
|
||||
method=Method.STRINGS,
|
||||
library_name=string_match.metadata.library_name,
|
||||
library_version=string_match.metadata.library_version,
|
||||
)
|
||||
)
|
||||
|
||||
for va in idautils.Functions():
|
||||
name = idaapi.get_func_name(va)
|
||||
if name not in {
|
||||
"WinMain",
|
||||
}:
|
||||
continue
|
||||
|
||||
function_classifications.append(
|
||||
FunctionClassification(
|
||||
va=va,
|
||||
name=name,
|
||||
classification=Classification.USER,
|
||||
method=Method.ENTRYPOINT,
|
||||
)
|
||||
)
|
||||
|
||||
doc = FunctionIdResults(function_classifications=[])
|
||||
classifications_by_va = capa.analysis.strings.create_index(function_classifications, "va")
|
||||
for va in idautils.Functions():
|
||||
if classifications := classifications_by_va.get(va):
|
||||
doc.function_classifications.extend(classifications)
|
||||
else:
|
||||
doc.function_classifications.append(
|
||||
FunctionClassification(
|
||||
va=va,
|
||||
name=idaapi.get_func_name(va),
|
||||
classification=Classification.UNKNOWN,
|
||||
method=None,
|
||||
)
|
||||
)
|
||||
|
||||
if args.json:
|
||||
print(doc.model_dump_json()) # noqa: T201 print found
|
||||
|
||||
else:
|
||||
table = rich.table.Table()
|
||||
table.add_column("FVA")
|
||||
table.add_column("CLASSIFICATION")
|
||||
table.add_column("METHOD")
|
||||
table.add_column("FNAME")
|
||||
table.add_column("EXTRA INFO")
|
||||
|
||||
classifications_by_va = capa.analysis.strings.create_index(doc.function_classifications, "va", sorted_=True)
|
||||
for va, classifications in classifications_by_va.items():
|
||||
name = ", ".join({c.name for c in classifications})
|
||||
if "sub_" in name:
|
||||
name = Text(name, style="grey53")
|
||||
|
||||
classification = {c.classification for c in classifications}
|
||||
method = {c.method for c in classifications if c.method}
|
||||
extra = {f"{c.library_name}@{c.library_version}" for c in classifications if c.library_name}
|
||||
|
||||
table.add_row(
|
||||
hex(va),
|
||||
", ".join(classification) if classification != {"unknown"} else Text("unknown", style="grey53"),
|
||||
", ".join(method),
|
||||
name,
|
||||
", ".join(extra),
|
||||
)
|
||||
|
||||
rich.print(table)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
2
capa/analysis/requirements.txt
Normal file
2
capa/analysis/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# temporary extra file to track dependencies of the analysis directory
|
||||
nltk==3.9.1
|
||||
269
capa/analysis/strings/__init__.py
Normal file
269
capa/analysis/strings/__init__.py
Normal file
@@ -0,0 +1,269 @@
|
||||
# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
"""
|
||||
further requirements:
|
||||
- nltk
|
||||
"""
|
||||
import gzip
|
||||
import logging
|
||||
import collections
|
||||
from typing import Any, Dict, Mapping
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
import msgspec
|
||||
|
||||
import capa.features.extractors.strings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LibraryString(msgspec.Struct):
|
||||
string: str
|
||||
library_name: str
|
||||
library_version: str
|
||||
file_path: str | None = None
|
||||
function_name: str | None = None
|
||||
line_number: int | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class LibraryStringDatabase:
|
||||
metadata_by_string: Dict[str, LibraryString]
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.metadata_by_string)
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, path: Path) -> "LibraryStringDatabase":
|
||||
metadata_by_string: Dict[str, LibraryString] = {}
|
||||
decoder = msgspec.json.Decoder(type=LibraryString)
|
||||
for line in gzip.decompress(path.read_bytes()).split(b"\n"):
|
||||
if not line:
|
||||
continue
|
||||
s = decoder.decode(line)
|
||||
metadata_by_string[s.string] = s
|
||||
|
||||
return cls(metadata_by_string=metadata_by_string)
|
||||
|
||||
|
||||
DEFAULT_FILENAMES = (
|
||||
"brotli.jsonl.gz",
|
||||
"bzip2.jsonl.gz",
|
||||
"cryptopp.jsonl.gz",
|
||||
"curl.jsonl.gz",
|
||||
"detours.jsonl.gz",
|
||||
"jemalloc.jsonl.gz",
|
||||
"jsoncpp.jsonl.gz",
|
||||
"kcp.jsonl.gz",
|
||||
"liblzma.jsonl.gz",
|
||||
"libsodium.jsonl.gz",
|
||||
"libpcap.jsonl.gz",
|
||||
"mbedtls.jsonl.gz",
|
||||
"openssl.jsonl.gz",
|
||||
"sqlite3.jsonl.gz",
|
||||
"tomcrypt.jsonl.gz",
|
||||
"wolfssl.jsonl.gz",
|
||||
"zlib.jsonl.gz",
|
||||
)
|
||||
|
||||
DEFAULT_PATHS = tuple(Path(__file__).parent / "data" / "oss" / filename for filename in DEFAULT_FILENAMES) + (
|
||||
Path(__file__).parent / "data" / "crt" / "msvc_v143.jsonl.gz",
|
||||
)
|
||||
|
||||
|
||||
def get_default_databases() -> list[LibraryStringDatabase]:
    """load each of the bundled library string databases from DEFAULT_PATHS."""
    databases = []
    for path in DEFAULT_PATHS:
        databases.append(LibraryStringDatabase.from_file(path))
    return databases
|
||||
|
||||
|
||||
@dataclass
class WindowsApiStringDatabase:
    """sets of known Windows DLL names and API names, loaded from gzipped text files."""

    dll_names: set[str]
    api_names: set[str]

    def __len__(self) -> int:
        # total number of known names across both sets.
        return len(self.dll_names) + len(self.api_names)

    @classmethod
    def from_dir(cls, path: Path) -> "WindowsApiStringDatabase":
        """load from a directory containing dlls.txt.gz and apis.txt.gz (one name per line)."""

        def load_names(p: Path) -> set[str]:
            # decompress and collect each non-empty line as a name.
            text = gzip.decompress(p.read_bytes()).decode("utf-8")
            return {line for line in text.splitlines() if line}

        return cls(
            dll_names=load_names(path / "dlls.txt.gz"),
            api_names=load_names(path / "apis.txt.gz"),
        )

    @classmethod
    def from_defaults(cls) -> "WindowsApiStringDatabase":
        """load the database bundled under data/winapi/."""
        return cls.from_dir(Path(__file__).parent / "data" / "winapi")
|
||||
|
||||
|
||||
def extract_strings(buf, n=4):
    """yield ASCII strings, then unicode strings, of at least n characters found in buf."""
    extractors = (
        capa.features.extractors.strings.extract_ascii_strings,
        capa.features.extractors.strings.extract_unicode_strings,
    )
    for extractor in extractors:
        yield from extractor(buf, n=n)
|
||||
|
||||
|
||||
def prune_databases(dbs: list[LibraryStringDatabase], n=8):
    """remove less trustworthy database entries, in place.

    such as:
      - those found in multiple databases
      - those that are English words (per the nltk "words" corpus)
      - those that are too short (fewer than n characters)
      - Windows API and DLL names
    """

    # TODO: consider applying these filters directly to the persisted databases, not at load time.

    winapi = WindowsApiStringDatabase.from_defaults()

    try:
        from nltk.corpus import words as nltk_words

        # accessing the corpus raises LookupError when the dataset isn't downloaded yet.
        nltk_words.words()
    except (ImportError, LookupError):
        # one-time download of dataset.
        # this probably doesn't work well for embedded use.
        import nltk

        nltk.download("words")
        from nltk.corpus import words as nltk_words
    words = set(nltk_words.words())

    # count how many databases each string appears in,
    # and collect the strings to remove across all databases.
    counter: collections.Counter[str] = collections.Counter()
    to_remove = set()
    for db in dbs:
        for string in db.metadata_by_string.keys():
            counter[string] += 1

            if string in words:
                # common English word: not a distinctive library marker.
                to_remove.add(string)
                continue

            if len(string) < n:
                # too short to be distinctive.
                to_remove.add(string)
                continue

            if string in winapi.api_names:
                to_remove.add(string)
                continue

            if string in winapi.dll_names:
                to_remove.add(string)
                continue

    for string, count in counter.most_common():
        if count <= 1:
            # most_common is sorted descending, so all remaining counts are also <= 1.
            break

        # remove strings that are seen in more than one database
        to_remove.add(string)

    # apply the removals to every database, in place.
    for db in dbs:
        for string in to_remove:
            if string in db.metadata_by_string:
                del db.metadata_by_string[string]
|
||||
|
||||
|
||||
def get_function_strings():
    """collect the strings referenced by each function in the current IDA database.

    returns: mapping from function start EA to the set of referenced strings.
    must be invoked within an active IDA session (uses idaapi/idautils).
    """
    import idaapi
    import idautils

    import capa.features.extractors.ida.helpers as ida_helpers

    strings_by_function = collections.defaultdict(set)
    for ea in idautils.Functions():
        f = idaapi.get_func(ea)

        # ignore library functions and thunk functions as identified by IDA
        if f.flags & idaapi.FUNC_THUNK:
            continue
        if f.flags & idaapi.FUNC_LIB:
            continue

        for bb in ida_helpers.get_function_blocks(f):
            for insn in ida_helpers.get_instructions_in_range(bb.start_ea, bb.end_ea):
                ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn)
                if ref == insn.ea:
                    # NOTE(review): ref == insn.ea appears to mean "no data reference found" —
                    # confirm against find_data_reference_from_insn.
                    continue

                string = capa.features.extractors.ida.helpers.find_string_at(ref)
                if not string:
                    continue

                strings_by_function[ea].add(string)

    return strings_by_function
|
||||
|
||||
|
||||
@dataclass
class LibraryStringClassification:
    """a library string matched against a string referenced by a function."""

    # start address of the function that references the string.
    va: int
    string: str
    # convenience copy of metadata.library_name.
    library_name: str
    metadata: LibraryString
|
||||
|
||||
|
||||
def create_index(s: list, k: str, sorted_: bool = False) -> Mapping[Any, list]:
    """group the elements of `s` by the attribute named `k`, optionally sorting by `k` first.

    returns a mapping from each attribute value to the list of elements with that value.
    """
    items = sorted(s, key=lambda item: getattr(item, k)) if sorted_ else s

    index = collections.defaultdict(list)
    for item in items:
        index[getattr(item, k)].append(item)
    return index
|
||||
|
||||
|
||||
def get_string_matches(dbs: list[LibraryStringDatabase]) -> list[LibraryStringClassification]:
    """match strings referenced by IDA functions against the given library string databases.

    two pruning passes reduce noise:
      - libraries with five or fewer matched strings are dropped entirely, and
      - functions whose matches span multiple libraries are dropped (conflicting evidence).

    must be invoked within an active IDA session.
    """
    matches: list[LibraryStringClassification] = []

    for function, strings in sorted(get_function_strings().items()):
        for string in strings:
            for db in dbs:
                if metadata := db.metadata_by_string.get(string):
                    matches.append(
                        LibraryStringClassification(
                            va=function,
                            string=string,
                            library_name=metadata.library_name,
                            metadata=metadata,
                        )
                    )

    # if a library has only a few (five or fewer) matched strings, ignore that library
    matches_by_library = create_index(matches, "library_name")
    for library_name, library_matches in matches_by_library.items():
        if len(library_matches) > 5:
            continue

        logger.info("pruning library %s: only %d matched string", library_name, len(library_matches))
        matches = [m for m in matches if m.library_name != library_name]

    # if there are conflicts within a single function, don't label it
    matches_by_function = create_index(matches, "va")
    for va, function_matches in matches_by_function.items():
        library_names = {m.library_name for m in function_matches}
        if len(library_names) == 1:
            continue

        logger.info("conflicting matches: 0x%x: %s", va, sorted(library_names))
        # this is potentially slow (O(n**2)) but hopefully fast enough in practice.
        matches = [m for m in matches if m.va != va]

    return matches
|
||||
130
capa/analysis/strings/__main__.py
Normal file
130
capa/analysis/strings/__main__.py
Normal file
@@ -0,0 +1,130 @@
|
||||
# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
import sys
|
||||
import logging
|
||||
import collections
|
||||
from pathlib import Path
|
||||
|
||||
import rich
|
||||
from rich.text import Text
|
||||
|
||||
import capa.analysis.strings
|
||||
import capa.features.extractors.strings
|
||||
import capa.features.extractors.ida.helpers as ida_helpers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def open_ida(input_path: Path):
    """copy the input file to a temp location and open it with IDA, waiting for auto-analysis.

    requires the idapro (idalib) Python module.
    """
    import tempfile

    import idapro

    # work on a copy so IDA's database files don't pollute the input's directory.
    t = Path(tempfile.mkdtemp(prefix="ida-")) / input_path.name
    t.write_bytes(input_path.read_bytes())
    # resource leak: we should delete this upon exit

    idapro.enable_console_messages(False)
    idapro.open_database(str(t.absolute()), run_auto_analysis=True)

    import ida_auto

    # block until IDA's auto-analysis completes.
    ida_auto.auto_wait()
|
||||
|
||||
|
||||
def main():
    """report library strings found in the given binary, then label functions via IDA.

    usage: the input binary path is taken as the single CLI argument.
    the raw file bytes are first scanned against the bundled string databases;
    if any library strings are found, the file is opened in IDA and the functions
    that reference those strings are reported.
    """
    logging.basicConfig(level=logging.DEBUG)

    # use n=8 to ignore common words
    N = 8

    input_path = Path(sys.argv[1])

    dbs = capa.analysis.strings.get_default_databases()
    capa.analysis.strings.prune_databases(dbs, n=N)

    # map library name -> set of matched strings found in the raw file bytes.
    strings_by_library = collections.defaultdict(set)
    for string in capa.analysis.strings.extract_strings(input_path.read_bytes(), n=N):
        for db in dbs:
            if metadata := db.metadata_by_string.get(string.s):
                strings_by_library[metadata.library_name].add(string.s)

    console = rich.get_console()
    console.print("found libraries:", style="bold")
    # report libraries with the most matched strings first.
    for library, strings in sorted(strings_by_library.items(), key=lambda p: len(p[1]), reverse=True):
        console.print(f" - [b]{library}[/] ({len(strings)} strings)")

        # show at most ten example strings per library.
        for string in sorted(strings)[:10]:
            console.print(f" - {string}", markup=False, style="grey37")

        if len(strings) > 10:
            console.print(" ...", style="grey37")

    if not strings_by_library:
        console.print(" (none)", style="grey37")
        # since we're not going to find any strings
        # return early and don't do IDA analysis
        return

    open_ida(input_path)

    # NOTE(review): imported after open_ida() — presumably these require an
    # initialized IDA session; confirm against idalib docs.
    import idaapi
    import idautils
    import ida_funcs

    # map function EA -> set of matched library strings referenced by that function.
    strings_by_function = collections.defaultdict(set)
    for ea in idautils.Functions():
        f = idaapi.get_func(ea)

        # ignore library functions and thunk functions as identified by IDA
        if f.flags & idaapi.FUNC_THUNK:
            continue
        if f.flags & idaapi.FUNC_LIB:
            continue

        for bb in ida_helpers.get_function_blocks(f):
            for insn in ida_helpers.get_instructions_in_range(bb.start_ea, bb.end_ea):
                ref = capa.features.extractors.ida.helpers.find_data_reference_from_insn(insn)
                if ref == insn.ea:
                    continue

                string = capa.features.extractors.ida.helpers.find_string_at(ref)
                if not string:
                    continue

                # only record strings known to one of the library databases.
                for db in dbs:
                    if metadata := db.metadata_by_string.get(string):
                        strings_by_function[ea].add(string)

    # ensure there are at least XXX functions renamed, or ignore those entries

    console.print("functions:", style="bold")
    for function, strings in sorted(strings_by_function.items()):
        if strings:
            name = ida_funcs.get_func_name(function)

            console.print(f" [b]{name}[/]@{function:08x}:")

            # show each matched string along with its library/version/function origin.
            for string in strings:
                for db in dbs:
                    if metadata := db.metadata_by_string.get(string):
                        location = Text(
                            f"{metadata.library_name}@{metadata.library_version}::{metadata.function_name}",
                            style="grey37",
                        )
                        console.print(" - ", location, ": ", string.rstrip())

            console.print()

    console.print(
        f"found {len(strings_by_function)} library functions across {len(list(idautils.Functions()))} functions"
    )
|
||||
|
||||
|
||||
# script entry point.
if __name__ == "__main__":
    main()
|
||||
BIN
capa/analysis/strings/data/crt/msvc_v143.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/crt/msvc_v143.jsonl.gz
Normal file
Binary file not shown.
3
capa/analysis/strings/data/oss/.gitignore
vendored
Normal file
3
capa/analysis/strings/data/oss/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
*.csv
|
||||
*.jsonl
|
||||
*.jsonl.gz
|
||||
BIN
capa/analysis/strings/data/oss/brotli.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/brotli.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/bzip2.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/bzip2.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/cryptopp.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/cryptopp.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/curl.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/curl.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/detours.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/detours.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/jemalloc.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/jemalloc.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/jsoncpp.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/jsoncpp.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/kcp.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/kcp.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/liblzma.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/liblzma.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/libpcap.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/libpcap.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/libsodium.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/libsodium.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/mbedtls.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/mbedtls.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/openssl.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/openssl.jsonl.gz
Normal file
Binary file not shown.
99
capa/analysis/strings/data/oss/readme.md
Normal file
99
capa/analysis/strings/data/oss/readme.md
Normal file
@@ -0,0 +1,99 @@
|
||||
# Strings from Open Source libraries
|
||||
|
||||
This directory contains databases of strings extracted from open source software.
|
||||
capa uses these databases to ignore functions that are likely library code.
|
||||
|
||||
There is one file for each database. Each database is a gzip-compressed, JSONL (one JSON document per line) file.
|
||||
The JSON document looks like this:
|
||||
|
||||
string: "1.0.8, 13-Jul-2019"
|
||||
library_name: "bzip2"
|
||||
library_version: "1.0.8#3"
|
||||
file_path: "CMakeFiles/bz2.dir/bzlib.c.obj"
|
||||
function_name: "BZ2_bzlibVersion"
|
||||
line_number: null
|
||||
|
||||
The following databases were extracted via the vcpkg & jh technique:
|
||||
|
||||
- brotli 1.0.9#5
|
||||
- bzip2 1.0.8#3
|
||||
- cryptopp 8.7.0
|
||||
- curl 7.86.0#1
|
||||
- detours 4.0.1#7
|
||||
- jemalloc 5.3.0#1
|
||||
- jsoncpp 1.9.5
|
||||
- kcp 1.7
|
||||
- liblzma 5.2.5#6
|
||||
- libsodium 1.0.18#8
|
||||
- libpcap 1.10.1#3
|
||||
- mbedtls 2.28.1
|
||||
- openssl 3.0.7#1
|
||||
- sqlite3 3.40.0#1
|
||||
- tomcrypt 1.18.2#2
|
||||
- wolfssl 5.5.0
|
||||
- zlib 1.2.13
|
||||
|
||||
This code was originally developed in FLOSS and imported into capa.
|
||||
|
||||
## The vcpkg & jh technique
|
||||
|
||||
Major steps:
|
||||
|
||||
1. build static libraries via vcpkg
|
||||
2. extract features via jh
|
||||
3. convert to JSONL format with `jh_to_qs.py`
|
||||
4. compress with gzip
|
||||
|
||||
### Build static libraries via vcpkg
|
||||
|
||||
[vcpkg](https://vcpkg.io/en/) is a free C/C++ package manager for acquiring and managing libraries.
|
||||
We use it to easily build common open source libraries, like zlib.
|
||||
Use the triplet `x64-windows-static` to build static archives (.lib files that are AR archives containing COFF object files):
|
||||
|
||||
```console
|
||||
PS > C:\vcpkg\vcpkg.exe install --triplet x64-windows-static zlib
|
||||
```
|
||||
|
||||
### Extract features via jh
|
||||
|
||||
[jh](https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs)
|
||||
is a lancelot-based utility that parses AR archives containing COFF object files,
|
||||
reconstructs their control flow, finds functions, and extracts features.
|
||||
jh extracts numbers, API calls, and strings; we are only interested in the string features.
|
||||
|
||||
For each feature, jh emits a CSV line with the fields
|
||||
- target triplet
|
||||
- compiler
|
||||
- library
|
||||
- version
|
||||
- build profile
|
||||
- path
|
||||
- function
|
||||
- feature type
|
||||
- feature value
|
||||
|
||||
For example:
|
||||
|
||||
```csv
|
||||
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100
|
||||
```
|
||||
|
||||
For example, to invoke jh:
|
||||
|
||||
```console
|
||||
$ ~/lancelot/target/release/jh x64-windows-static msvc143 zlib 1.2.13 release /mnt/c/vcpkg/installed/x64-windows-static/lib/zlib.lib > ~/flare-floss/floss/qs/db/data/oss/zlib.csv
|
||||
```
|
||||
|
||||
### Convert to OSS database format
|
||||
|
||||
We use the script `jh_to_qs.py` to convert these CSV lines into JSONL file prepared for FLOSS:
|
||||
|
||||
```console
|
||||
$ python3 jh_to_qs.py zlib.csv > zlib.jsonl
|
||||
```
|
||||
|
||||
These files are then gzip'd:
|
||||
|
||||
```console
|
||||
$ gzip -c zlib.jsonl > zlib.jsonl.gz
|
||||
```
|
||||
BIN
capa/analysis/strings/data/oss/sqlite3.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/sqlite3.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/tomcrypt.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/tomcrypt.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/wolfssl.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/wolfssl.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/oss/zlib.jsonl.gz
Normal file
BIN
capa/analysis/strings/data/oss/zlib.jsonl.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/winapi/apis.txt.gz
Normal file
BIN
capa/analysis/strings/data/winapi/apis.txt.gz
Normal file
Binary file not shown.
BIN
capa/analysis/strings/data/winapi/dlls.txt.gz
Normal file
BIN
capa/analysis/strings/data/winapi/dlls.txt.gz
Normal file
Binary file not shown.
@@ -466,6 +466,7 @@ FORMAT_VMRAY = "vmray"
|
||||
FORMAT_BINEXPORT2 = "binexport2"
|
||||
FORMAT_FREEZE = "freeze"
|
||||
FORMAT_RESULT = "result"
|
||||
FORMAT_BINJA_DB = "binja_database"
|
||||
STATIC_FORMATS = {
|
||||
FORMAT_SC32,
|
||||
FORMAT_SC64,
|
||||
@@ -475,6 +476,7 @@ STATIC_FORMATS = {
|
||||
FORMAT_FREEZE,
|
||||
FORMAT_RESULT,
|
||||
FORMAT_BINEXPORT2,
|
||||
FORMAT_BINJA_DB,
|
||||
}
|
||||
DYNAMIC_FORMATS = {
|
||||
FORMAT_CAPE,
|
||||
|
||||
@@ -280,11 +280,13 @@ class BinExport2Analysis:
|
||||
curr_idx: int = idx
|
||||
for _ in range(capa.features.common.THUNK_CHAIN_DEPTH_DELTA):
|
||||
thunk_callees: list[int] = self.idx.callees_by_vertex_index[curr_idx]
|
||||
# if this doesn't hold, then it doesn't seem like this is a thunk,
|
||||
# If this doesn't hold, then it doesn't seem like this is a thunk,
|
||||
# because either, len is:
|
||||
# 0 and the thunk doesn't point to anything, or
|
||||
# 0 and the thunk doesn't point to anything or is indirect, like `call eax`, or
|
||||
# >1 and the thunk may end up at many functions.
|
||||
assert len(thunk_callees) == 1, f"thunk @ {hex(addr)} failed"
|
||||
# In any case, this doesn't appear to be the sort of thunk we're looking for.
|
||||
if len(thunk_callees) != 1:
|
||||
break
|
||||
|
||||
thunked_idx: int = thunk_callees[0]
|
||||
thunked_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[thunked_idx]
|
||||
|
||||
@@ -50,6 +50,25 @@ def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGr
|
||||
return vertex.HasField("type") and vertex.type == type_
|
||||
|
||||
|
||||
# internal to `build_expression_tree`
|
||||
# this is unstable: it is subject to change, so don't rely on it!
|
||||
def _prune_expression_tree_references_to_tree_index(
|
||||
expression_tree: list[list[int]],
|
||||
tree_index: int,
|
||||
):
|
||||
# `i` is the index of the tree node that we'll search for `tree_index`
|
||||
# if we remove `tree_index` from it, and it is now empty,
|
||||
# then we'll need to prune references to `i`.
|
||||
for i, tree_node in enumerate(expression_tree):
|
||||
if tree_index in tree_node:
|
||||
tree_node.remove(tree_index)
|
||||
|
||||
if len(tree_node) == 0:
|
||||
# if the parent node is now empty,
|
||||
# remove references to that parent node.
|
||||
_prune_expression_tree_references_to_tree_index(expression_tree, i)
|
||||
|
||||
|
||||
# internal to `build_expression_tree`
|
||||
# this is unstable: it is subject to change, so don't rely on it!
|
||||
def _prune_expression_tree_empty_shifts(
|
||||
@@ -70,9 +89,7 @@ def _prune_expression_tree_empty_shifts(
|
||||
#
|
||||
# Which seems to be as if the shift wasn't there (shift of #0)
|
||||
# so we want to remove references to this node from any parent nodes.
|
||||
for tree_node in expression_tree:
|
||||
if tree_index in tree_node:
|
||||
tree_node.remove(tree_index)
|
||||
_prune_expression_tree_references_to_tree_index(expression_tree, tree_index)
|
||||
|
||||
return
|
||||
|
||||
@@ -82,7 +99,20 @@ def _prune_expression_tree_empty_shifts(
|
||||
|
||||
# internal to `build_expression_tree`
|
||||
# this is unstable: it is subject to change, so don't rely on it!
|
||||
def _prune_expression_tree_empty_commas(
|
||||
def _fixup_expression_tree_references_to_tree_index(
|
||||
expression_tree: list[list[int]],
|
||||
existing_index: int,
|
||||
new_index: int,
|
||||
):
|
||||
for tree_node in expression_tree:
|
||||
for i, index in enumerate(tree_node):
|
||||
if index == existing_index:
|
||||
tree_node[i] = new_index
|
||||
|
||||
|
||||
# internal to `build_expression_tree`
|
||||
# this is unstable: it is subject to change, so don't rely on it!
|
||||
def _fixup_expression_tree_lonely_commas(
|
||||
be2: BinExport2,
|
||||
operand: BinExport2.Operand,
|
||||
expression_tree: list[list[int]],
|
||||
@@ -94,26 +124,12 @@ def _prune_expression_tree_empty_commas(
|
||||
|
||||
if expression.type == BinExport2.Expression.OPERATOR:
|
||||
if len(children_tree_indexes) == 1 and expression.symbol == ",":
|
||||
# Due to the above pruning of empty LSL or LSR expressions,
|
||||
# the parents might need to be fixed up.
|
||||
#
|
||||
# Specifically, if the pruned node was part of a comma list with two children,
|
||||
# now there's only a single child, which renders as an extra comma,
|
||||
# so we replace references to the comma node with the immediate child.
|
||||
#
|
||||
# A more correct way of doing this might be to walk up the parents and do fixups,
|
||||
# but I'm not quite sure how to do this yet. Just do two passes right now.
|
||||
child = children_tree_indexes[0]
|
||||
|
||||
for tree_node in expression_tree:
|
||||
tree_node.index
|
||||
if tree_index in tree_node:
|
||||
tree_node[tree_node.index(tree_index)] = child
|
||||
|
||||
return
|
||||
existing_index = tree_index
|
||||
new_index = children_tree_indexes[0]
|
||||
_fixup_expression_tree_references_to_tree_index(expression_tree, existing_index, new_index)
|
||||
|
||||
for child_tree_index in children_tree_indexes:
|
||||
_prune_expression_tree_empty_commas(be2, operand, expression_tree, child_tree_index)
|
||||
_fixup_expression_tree_lonely_commas(be2, operand, expression_tree, child_tree_index)
|
||||
|
||||
|
||||
# internal to `build_expression_tree`
|
||||
@@ -124,7 +140,7 @@ def _prune_expression_tree(
|
||||
expression_tree: list[list[int]],
|
||||
):
|
||||
_prune_expression_tree_empty_shifts(be2, operand, expression_tree, 0)
|
||||
_prune_expression_tree_empty_commas(be2, operand, expression_tree, 0)
|
||||
_fixup_expression_tree_lonely_commas(be2, operand, expression_tree, 0)
|
||||
|
||||
|
||||
# this is unstable: it is subject to change, so don't rely on it!
|
||||
@@ -173,7 +189,6 @@ def _build_expression_tree(
|
||||
tree.append(children)
|
||||
|
||||
_prune_expression_tree(be2, operand, tree)
|
||||
_prune_expression_tree(be2, operand, tree)
|
||||
|
||||
return tree
|
||||
|
||||
@@ -193,9 +208,22 @@ def _fill_operand_expression_list(
|
||||
children_tree_indexes: list[int] = expression_tree[tree_index]
|
||||
|
||||
if expression.type == BinExport2.Expression.REGISTER:
|
||||
assert len(children_tree_indexes) == 0
|
||||
assert len(children_tree_indexes) <= 1
|
||||
expression_list.append(expression)
|
||||
return
|
||||
|
||||
if len(children_tree_indexes) == 0:
|
||||
return
|
||||
elif len(children_tree_indexes) == 1:
|
||||
# like for aarch64 with vector instructions, indicating vector data size:
|
||||
#
|
||||
# FADD V0.4S, V1.4S, V2.4S
|
||||
#
|
||||
# see: https://github.com/mandiant/capa/issues/2528
|
||||
child_index = children_tree_indexes[0]
|
||||
_fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list)
|
||||
return
|
||||
else:
|
||||
raise NotImplementedError(len(children_tree_indexes))
|
||||
|
||||
elif expression.type == BinExport2.Expression.SYMBOL:
|
||||
assert len(children_tree_indexes) <= 1
|
||||
@@ -218,9 +246,23 @@ def _fill_operand_expression_list(
|
||||
raise NotImplementedError(len(children_tree_indexes))
|
||||
|
||||
elif expression.type == BinExport2.Expression.IMMEDIATE_INT:
|
||||
assert len(children_tree_indexes) == 0
|
||||
assert len(children_tree_indexes) <= 1
|
||||
expression_list.append(expression)
|
||||
return
|
||||
|
||||
if len(children_tree_indexes) == 0:
|
||||
return
|
||||
elif len(children_tree_indexes) == 1:
|
||||
# the ghidra exporter can produce some weird expressions,
|
||||
# particularly for MSRs, like for:
|
||||
#
|
||||
# sreg(3, 0, c.0, c.4, 4)
|
||||
#
|
||||
# see: https://github.com/mandiant/capa/issues/2530
|
||||
child_index = children_tree_indexes[0]
|
||||
_fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list)
|
||||
return
|
||||
else:
|
||||
raise NotImplementedError(len(children_tree_indexes))
|
||||
|
||||
elif expression.type == BinExport2.Expression.SIZE_PREFIX:
|
||||
# like: b4
|
||||
|
||||
@@ -5,111 +5,21 @@
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
import string
|
||||
from typing import Iterator
|
||||
|
||||
from binaryninja import Function
|
||||
from binaryninja import BasicBlock as BinjaBasicBlock
|
||||
from binaryninja import (
|
||||
BinaryView,
|
||||
SymbolType,
|
||||
RegisterValueType,
|
||||
VariableSourceType,
|
||||
MediumLevelILOperation,
|
||||
MediumLevelILBasicBlock,
|
||||
MediumLevelILInstruction,
|
||||
)
|
||||
|
||||
from capa.features.common import Feature, Characteristic
|
||||
from capa.features.address import Address
|
||||
from capa.features.basicblock import BasicBlock
|
||||
from capa.features.extractors.helpers import MIN_STACKSTRING_LEN
|
||||
from capa.features.extractors.base_extractor import BBHandle, FunctionHandle
|
||||
|
||||
|
||||
def get_printable_len_ascii(s: bytes) -> int:
    """count printable-ASCII characters of s, stopping at the first NUL byte.

    non-printable, non-NUL bytes are skipped (not counted) without terminating the scan.
    """
    count = 0
    for byte in s:
        if byte == 0:
            break
        if byte < 127 and chr(byte) in string.printable:
            count += 1
    return count


def get_printable_len_wide(s: bytes) -> int:
    """treat s as UTF-16LE: if every high byte is zero, count the printable low bytes; else 0."""
    high_bytes = s[1::2]
    if any(high_bytes):
        return 0
    return get_printable_len_ascii(s[::2])
|
||||
|
||||
|
||||
def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int:
    """return the printable length of a constant string copied to a stack variable by `il`, or 0.

    recognizes MLIL calls to __builtin_strncpy/__builtin_strcpy/__builtin_wcscpy whose
    destination is a stack variable and whose source is constant aggregate data.
    """
    bv: BinaryView = f.view

    # only call instructions are candidates.
    if il.operation != MediumLevelILOperation.MLIL_CALL:
        return 0

    # the call target must be a constant address.
    target = il.dest
    if target.operation not in [MediumLevelILOperation.MLIL_CONST, MediumLevelILOperation.MLIL_CONST_PTR]:
        return 0

    addr = target.value.value
    sym = bv.get_symbol_at(addr)
    if not sym or sym.type not in [SymbolType.LibraryFunctionSymbol, SymbolType.SymbolicFunctionSymbol]:
        return 0

    # only the builtin string-copy intrinsics indicate stackstring construction.
    if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]:
        return 0

    if len(il.params) < 2:
        return 0

    # the destination (first argument) must be (the address of) a stack variable.
    dest = il.params[0]
    if dest.operation in [MediumLevelILOperation.MLIL_ADDRESS_OF, MediumLevelILOperation.MLIL_VAR]:
        var = dest.src
    else:
        return 0

    if var.source_type != VariableSourceType.StackVariableSourceType:
        return 0

    # the source (second argument) must be constant aggregate data (the string bytes).
    src = il.params[1]
    if src.value.type != RegisterValueType.ConstantDataAggregateValue:
        return 0

    s = f.get_constant_data(RegisterValueType.ConstantDataAggregateValue, src.value.value)
    # score as the longer of the ASCII vs UTF-16LE interpretation of the bytes.
    return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s)))
|
||||
|
||||
|
||||
def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool:
    """check basic block for stackstring indicators

    true if basic block contains enough moves of constant bytes to the stack
    (accumulated printable length exceeds MIN_STACKSTRING_LEN)
    """
    count = 0
    for il in bb:
        # sum printable lengths of constant copies to the stack across the block.
        count += get_stack_string_len(f, il)
        if count > MIN_STACKSTRING_LEN:
            return True

    return False
|
||||
|
||||
|
||||
def extract_bb_stackstring(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]:
    """extract stackstring indicators from basic block"""
    # bbh.inner pairs the disassembly basic block with its MLIL basic block (may be None).
    bb: tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner
    if bb[1] is not None and bb_contains_stackstring(fh.inner, bb[1]):
        yield Characteristic("stack string"), bbh.address
|
||||
|
||||
|
||||
def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
"""extract tight loop indicators from a basic block"""
|
||||
bb: tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner
|
||||
for edge in bb[0].outgoing_edges:
|
||||
if edge.target.start == bb[0].start:
|
||||
bb: BinjaBasicBlock = bbh.inner
|
||||
for edge in bb.outgoing_edges:
|
||||
if edge.target.start == bb.start:
|
||||
yield Characteristic("tight loop"), bbh.address
|
||||
|
||||
|
||||
@@ -121,7 +31,4 @@ def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Featur
|
||||
yield BasicBlock(), bbh.address
|
||||
|
||||
|
||||
BASIC_BLOCK_HANDLERS = (
|
||||
extract_bb_tight_loop,
|
||||
extract_bb_stackstring,
|
||||
)
|
||||
BASIC_BLOCK_HANDLERS = (extract_bb_tight_loop,)
|
||||
|
||||
@@ -5,19 +5,14 @@
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
import logging
|
||||
from typing import Iterator
|
||||
from collections import defaultdict
|
||||
|
||||
import binaryninja as binja
|
||||
from binaryninja import Function, BinaryView, SymbolType, ILException, RegisterValueType, LowLevelILOperation
|
||||
|
||||
import capa.perf
|
||||
import capa.features.extractors.elf
|
||||
import capa.features.extractors.binja.file
|
||||
import capa.features.extractors.binja.insn
|
||||
import capa.features.extractors.binja.global_
|
||||
import capa.features.extractors.binja.helpers
|
||||
import capa.features.extractors.binja.function
|
||||
import capa.features.extractors.binja.basicblock
|
||||
from capa.features.common import Feature
|
||||
@@ -30,8 +25,6 @@ from capa.features.extractors.base_extractor import (
|
||||
StaticFeatureExtractor,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BinjaFeatureExtractor(StaticFeatureExtractor):
|
||||
def __init__(self, bv: binja.BinaryView):
|
||||
@@ -42,9 +35,6 @@ class BinjaFeatureExtractor(StaticFeatureExtractor):
|
||||
self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv))
|
||||
self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv))
|
||||
|
||||
with capa.perf.timing("binary ninja: computing call graph"):
|
||||
self._call_graph = self._build_call_graph()
|
||||
|
||||
def get_base_address(self):
|
||||
return AbsoluteVirtualAddress(self.bv.start)
|
||||
|
||||
@@ -54,65 +44,29 @@ class BinjaFeatureExtractor(StaticFeatureExtractor):
|
||||
def extract_file_features(self):
|
||||
yield from capa.features.extractors.binja.file.extract_features(self.bv)
|
||||
|
||||
def _build_call_graph(self):
|
||||
# from function address to function addresses
|
||||
calls_from: defaultdict[int, set[int]] = defaultdict(set)
|
||||
calls_to: defaultdict[int, set[int]] = defaultdict(set)
|
||||
|
||||
f: Function
|
||||
for f in self.bv.functions:
|
||||
for caller in f.callers:
|
||||
calls_from[caller.start].add(f.start)
|
||||
calls_to[f.start].add(caller.start)
|
||||
|
||||
call_graph = {
|
||||
"calls_to": calls_to,
|
||||
"calls_from": calls_from,
|
||||
}
|
||||
|
||||
return call_graph
|
||||
|
||||
def get_functions(self) -> Iterator[FunctionHandle]:
|
||||
for f in self.bv.functions:
|
||||
yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f, ctx={"call_graph": self._call_graph})
|
||||
yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f)
|
||||
|
||||
def extract_function_features(self, fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
yield from capa.features.extractors.binja.function.extract_features(fh)
|
||||
|
||||
def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]:
|
||||
f: binja.Function = fh.inner
|
||||
# Set up a MLIL basic block dict look up to associate the disassembly basic block with its MLIL basic block
|
||||
mlil_lookup = {}
|
||||
try:
|
||||
mlil = f.mlil
|
||||
except ILException:
|
||||
return
|
||||
|
||||
if mlil is None:
|
||||
return
|
||||
|
||||
for mlil_bb in mlil.basic_blocks:
|
||||
mlil_lookup[mlil_bb.source_block.start] = mlil_bb
|
||||
|
||||
for bb in f.basic_blocks:
|
||||
mlil_bb = mlil_lookup.get(bb.start)
|
||||
|
||||
yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=(bb, mlil_bb))
|
||||
yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=bb)
|
||||
|
||||
def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
yield from capa.features.extractors.binja.basicblock.extract_features(fh, bbh)
|
||||
|
||||
def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]:
|
||||
f: binja.Function = fh.inner
|
||||
import capa.features.extractors.binja.helpers as binja_helpers
|
||||
|
||||
bb: binja.BasicBlock
|
||||
mlbb: binja.MediumLevelILBasicBlock
|
||||
bb, mlbb = bbh.inner
|
||||
bb: binja.BasicBlock = bbh.inner
|
||||
addr = bb.start
|
||||
|
||||
addr: int = bb.start
|
||||
for text, length in bb:
|
||||
llil = f.get_llils_at(addr)
|
||||
insn = capa.features.extractors.binja.helpers.DisassemblyInstruction(addr, length, text, llil)
|
||||
insn = binja_helpers.DisassemblyInstruction(addr, length, text)
|
||||
yield InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn)
|
||||
addr += length
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ from capa.features.common import (
|
||||
FORMAT_ELF,
|
||||
FORMAT_SC32,
|
||||
FORMAT_SC64,
|
||||
FORMAT_BINJA_DB,
|
||||
Format,
|
||||
String,
|
||||
Feature,
|
||||
@@ -137,6 +138,9 @@ def extract_file_function_names(bv: BinaryView) -> Iterator[tuple[Feature, Addre
|
||||
|
||||
|
||||
def extract_file_format(bv: BinaryView) -> Iterator[tuple[Feature, Address]]:
|
||||
if bv.file.database is not None:
|
||||
yield Format(FORMAT_BINJA_DB), NO_ADDRESS
|
||||
|
||||
view_type = bv.view_type
|
||||
if view_type in ["PE", "COFF"]:
|
||||
yield Format(FORMAT_PE), NO_ADDRESS
|
||||
|
||||
@@ -105,13 +105,13 @@ def find_binaryninja() -> Optional[Path]:
|
||||
logger.debug("detected OS: linux")
|
||||
elif sys.platform == "darwin":
|
||||
logger.warning("unsupported platform to find Binary Ninja: %s", sys.platform)
|
||||
return False
|
||||
return None
|
||||
elif sys.platform == "win32":
|
||||
logger.warning("unsupported platform to find Binary Ninja: %s", sys.platform)
|
||||
return False
|
||||
return None
|
||||
else:
|
||||
logger.warning("unsupported platform to find Binary Ninja: %s", sys.platform)
|
||||
return False
|
||||
return None
|
||||
|
||||
desktop_entry = get_desktop_entry("com.vector35.binaryninja.desktop")
|
||||
if not desktop_entry:
|
||||
|
||||
@@ -5,14 +5,28 @@
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
import string
|
||||
from typing import Iterator
|
||||
|
||||
from binaryninja import Function, BinaryView, SymbolType
|
||||
from binaryninja import (
|
||||
Function,
|
||||
BinaryView,
|
||||
SymbolType,
|
||||
ILException,
|
||||
RegisterValueType,
|
||||
VariableSourceType,
|
||||
LowLevelILOperation,
|
||||
MediumLevelILOperation,
|
||||
MediumLevelILBasicBlock,
|
||||
MediumLevelILInstruction,
|
||||
)
|
||||
|
||||
from capa.features.file import FunctionName
|
||||
from capa.features.common import Feature, Characteristic
|
||||
from capa.features.address import Address, AbsoluteVirtualAddress
|
||||
from capa.features.extractors import loops
|
||||
from capa.features.extractors.helpers import MIN_STACKSTRING_LEN
|
||||
from capa.features.extractors.binja.helpers import get_llil_instr_at_addr
|
||||
from capa.features.extractors.base_extractor import FunctionHandle
|
||||
|
||||
|
||||
@@ -20,24 +34,30 @@ def extract_function_calls_to(fh: FunctionHandle):
|
||||
"""extract callers to a function"""
|
||||
func: Function = fh.inner
|
||||
|
||||
caller: int
|
||||
for caller in fh.ctx["call_graph"]["calls_to"].get(func.start, []):
|
||||
if caller == func.start:
|
||||
for caller in func.caller_sites:
|
||||
# Everything that is a code reference to the current function is considered a caller, which actually includes
|
||||
# many other references that are NOT a caller. For example, an instruction `push function_start` will also be
|
||||
# considered a caller to the function
|
||||
llil = get_llil_instr_at_addr(func.view, caller.address)
|
||||
if (llil is None) or llil.operation not in [
|
||||
LowLevelILOperation.LLIL_CALL,
|
||||
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
|
||||
LowLevelILOperation.LLIL_JUMP,
|
||||
LowLevelILOperation.LLIL_TAILCALL,
|
||||
]:
|
||||
continue
|
||||
|
||||
yield Characteristic("calls to"), AbsoluteVirtualAddress(caller)
|
||||
|
||||
|
||||
def extract_function_calls_from(fh: FunctionHandle):
|
||||
"""extract callers from a function"""
|
||||
func: Function = fh.inner
|
||||
|
||||
callee: int
|
||||
for callee in fh.ctx["call_graph"]["calls_from"].get(func.start, []):
|
||||
if callee == func.start:
|
||||
if llil.dest.operation not in [
|
||||
LowLevelILOperation.LLIL_CONST,
|
||||
LowLevelILOperation.LLIL_CONST_PTR,
|
||||
]:
|
||||
continue
|
||||
|
||||
yield Characteristic("calls from"), AbsoluteVirtualAddress(callee)
|
||||
address = llil.dest.constant
|
||||
if address != func.start:
|
||||
continue
|
||||
|
||||
yield Characteristic("calls to"), AbsoluteVirtualAddress(caller.address)
|
||||
|
||||
|
||||
def extract_function_loop(fh: FunctionHandle):
|
||||
@@ -58,12 +78,13 @@ def extract_function_loop(fh: FunctionHandle):
|
||||
def extract_recursive_call(fh: FunctionHandle):
|
||||
"""extract recursive function call"""
|
||||
func: Function = fh.inner
|
||||
bv: BinaryView = func.view
|
||||
if bv is None:
|
||||
return
|
||||
|
||||
caller: int
|
||||
for caller in fh.ctx["call_graph"]["calls_to"].get(func.start, []):
|
||||
if caller == func.start:
|
||||
for ref in bv.get_code_refs(func.start):
|
||||
if ref.function == func:
|
||||
yield Characteristic("recursive call"), fh.address
|
||||
return
|
||||
|
||||
|
||||
def extract_function_name(fh: FunctionHandle):
|
||||
@@ -87,6 +108,93 @@ def extract_function_name(fh: FunctionHandle):
|
||||
yield FunctionName(name[1:]), sym.address
|
||||
|
||||
|
||||
def get_printable_len_ascii(s: bytes) -> int:
|
||||
"""Return string length if all operand bytes are ascii or utf16-le printable"""
|
||||
count = 0
|
||||
for c in s:
|
||||
if c == 0:
|
||||
return count
|
||||
if c < 127 and chr(c) in string.printable:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def get_printable_len_wide(s: bytes) -> int:
|
||||
"""Return string length if all operand bytes are ascii or utf16-le printable"""
|
||||
if all(c == 0x00 for c in s[1::2]):
|
||||
return get_printable_len_ascii(s[::2])
|
||||
return 0
|
||||
|
||||
|
||||
def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int:
|
||||
bv: BinaryView = f.view
|
||||
|
||||
if il.operation != MediumLevelILOperation.MLIL_CALL:
|
||||
return 0
|
||||
|
||||
target = il.dest
|
||||
if target.operation not in [MediumLevelILOperation.MLIL_CONST, MediumLevelILOperation.MLIL_CONST_PTR]:
|
||||
return 0
|
||||
|
||||
addr = target.value.value
|
||||
sym = bv.get_symbol_at(addr)
|
||||
if not sym or sym.type not in [SymbolType.LibraryFunctionSymbol, SymbolType.SymbolicFunctionSymbol]:
|
||||
return 0
|
||||
|
||||
if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]:
|
||||
return 0
|
||||
|
||||
if len(il.params) < 2:
|
||||
return 0
|
||||
|
||||
dest = il.params[0]
|
||||
if dest.operation in [MediumLevelILOperation.MLIL_ADDRESS_OF, MediumLevelILOperation.MLIL_VAR]:
|
||||
var = dest.src
|
||||
else:
|
||||
return 0
|
||||
|
||||
if var.source_type != VariableSourceType.StackVariableSourceType:
|
||||
return 0
|
||||
|
||||
src = il.params[1]
|
||||
if src.value.type != RegisterValueType.ConstantDataAggregateValue:
|
||||
return 0
|
||||
|
||||
s = f.get_constant_data(RegisterValueType.ConstantDataAggregateValue, src.value.value)
|
||||
return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s)))
|
||||
|
||||
|
||||
def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool:
|
||||
"""check basic block for stackstring indicators
|
||||
|
||||
true if basic block contains enough moves of constant bytes to the stack
|
||||
"""
|
||||
count = 0
|
||||
for il in bb:
|
||||
count += get_stack_string_len(f, il)
|
||||
if count > MIN_STACKSTRING_LEN:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def extract_stackstring(fh: FunctionHandle):
|
||||
"""extract stackstring indicators"""
|
||||
func: Function = fh.inner
|
||||
bv: BinaryView = func.view
|
||||
if bv is None:
|
||||
return
|
||||
|
||||
try:
|
||||
mlil = func.mlil
|
||||
except ILException:
|
||||
return
|
||||
|
||||
for block in mlil.basic_blocks:
|
||||
if bb_contains_stackstring(func, block):
|
||||
yield Characteristic("stack string"), block.source_block.start
|
||||
|
||||
|
||||
def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
for func_handler in FUNCTION_HANDLERS:
|
||||
for feature, addr in func_handler(fh):
|
||||
@@ -95,8 +203,8 @@ def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
|
||||
FUNCTION_HANDLERS = (
|
||||
extract_function_calls_to,
|
||||
extract_function_calls_from,
|
||||
extract_function_loop,
|
||||
extract_recursive_call,
|
||||
extract_function_name,
|
||||
extract_stackstring,
|
||||
)
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
import re
|
||||
from typing import Callable
|
||||
from typing import Callable, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
from binaryninja import BinaryView, LowLevelILOperation, LowLevelILInstruction
|
||||
from binaryninja import BinaryView, LowLevelILFunction, LowLevelILInstruction
|
||||
from binaryninja.architecture import InstructionTextToken
|
||||
|
||||
|
||||
@@ -18,24 +18,6 @@ class DisassemblyInstruction:
|
||||
address: int
|
||||
length: int
|
||||
text: list[InstructionTextToken]
|
||||
llil: list[LowLevelILInstruction]
|
||||
|
||||
@property
|
||||
def is_call(self):
|
||||
if not self.llil:
|
||||
return False
|
||||
|
||||
# TODO(williballenthin): when to use one vs many llil instructions
|
||||
# https://github.com/Vector35/binaryninja-api/issues/6205
|
||||
llil = self.llil[0]
|
||||
if not llil:
|
||||
return False
|
||||
|
||||
return llil.operation in [
|
||||
LowLevelILOperation.LLIL_CALL,
|
||||
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
|
||||
LowLevelILOperation.LLIL_TAILCALL,
|
||||
]
|
||||
|
||||
|
||||
LLIL_VISITOR = Callable[[LowLevelILInstruction, LowLevelILInstruction, int], bool]
|
||||
@@ -85,3 +67,13 @@ def read_c_string(bv: BinaryView, offset: int, max_len: int) -> str:
|
||||
s.append(chr(c))
|
||||
|
||||
return "".join(s)
|
||||
|
||||
|
||||
def get_llil_instr_at_addr(bv: BinaryView, addr: int) -> Optional[LowLevelILInstruction]:
|
||||
arch = bv.arch
|
||||
buffer = bv.read(addr, arch.max_instr_length)
|
||||
llil = LowLevelILFunction(arch=arch)
|
||||
llil.current_address = addr
|
||||
if arch.get_instruction_low_level_il(buffer, addr, llil) == 0:
|
||||
return None
|
||||
return llil[0]
|
||||
|
||||
@@ -13,7 +13,6 @@ from binaryninja import (
|
||||
BinaryView,
|
||||
ILRegister,
|
||||
SymbolType,
|
||||
ILException,
|
||||
BinaryReader,
|
||||
RegisterValueType,
|
||||
LowLevelILOperation,
|
||||
@@ -23,8 +22,8 @@ from binaryninja import (
|
||||
import capa.features.extractors.helpers
|
||||
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
|
||||
from capa.features.common import MAX_BYTES_FEATURE_SIZE, Bytes, String, Feature, Characteristic
|
||||
from capa.features.address import Address
|
||||
from capa.features.extractors.binja.helpers import DisassemblyInstruction, visit_llil_exprs
|
||||
from capa.features.address import Address, AbsoluteVirtualAddress
|
||||
from capa.features.extractors.binja.helpers import DisassemblyInstruction, visit_llil_exprs, get_llil_instr_at_addr
|
||||
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
|
||||
|
||||
# security cookie checks may perform non-zeroing XORs, these are expected within a certain
|
||||
@@ -32,46 +31,29 @@ from capa.features.extractors.base_extractor import BBHandle, InsnHandle, Functi
|
||||
SECURITY_COOKIE_BYTES_DELTA = 0x40
|
||||
|
||||
|
||||
# TODO: move this to call graph pass
|
||||
# check if a function is a stub function to another function/symbol. The criteria is:
|
||||
# 1. The function must only have one basic block
|
||||
# 2. The function must only make one call/jump to another address
|
||||
# If the function being checked is a stub function, returns the target address. Otherwise, return None.
|
||||
def is_stub_function(bv: BinaryView, addr: int) -> Optional[int]:
|
||||
funcs = bv.get_functions_at(addr)
|
||||
for func in funcs:
|
||||
if len(func.basic_blocks) != 1:
|
||||
continue
|
||||
llil = get_llil_instr_at_addr(bv, addr)
|
||||
if llil is None or llil.operation not in [
|
||||
LowLevelILOperation.LLIL_CALL,
|
||||
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
|
||||
LowLevelILOperation.LLIL_JUMP,
|
||||
LowLevelILOperation.LLIL_TAILCALL,
|
||||
]:
|
||||
return None
|
||||
|
||||
call_count = 0
|
||||
call_target = None
|
||||
try:
|
||||
llil = func.llil
|
||||
except ILException:
|
||||
return None
|
||||
# The LLIL instruction retrieved by `get_llil_instr_at_addr` did not go through a full analysis, so we cannot check
|
||||
# `llil.dest.value.type` here
|
||||
if llil.dest.operation not in [
|
||||
LowLevelILOperation.LLIL_CONST,
|
||||
LowLevelILOperation.LLIL_CONST_PTR,
|
||||
]:
|
||||
return None
|
||||
|
||||
if llil is None:
|
||||
continue
|
||||
|
||||
for il in llil.instructions:
|
||||
if il.operation in [
|
||||
LowLevelILOperation.LLIL_CALL,
|
||||
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
|
||||
LowLevelILOperation.LLIL_JUMP,
|
||||
LowLevelILOperation.LLIL_TAILCALL,
|
||||
]:
|
||||
call_count += 1
|
||||
if il.dest.value.type in [
|
||||
RegisterValueType.ImportedAddressValue,
|
||||
RegisterValueType.ConstantValue,
|
||||
RegisterValueType.ConstantPointerValue,
|
||||
]:
|
||||
call_target = il.dest.value.value
|
||||
|
||||
if call_count == 1 and call_target is not None:
|
||||
return call_target
|
||||
|
||||
return None
|
||||
return llil.dest.constant
|
||||
|
||||
|
||||
def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
@@ -83,9 +65,8 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle)
|
||||
"""
|
||||
func: Function = fh.inner
|
||||
bv: BinaryView = func.view
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
if llil.operation in [
|
||||
LowLevelILOperation.LLIL_CALL,
|
||||
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
|
||||
@@ -140,11 +121,10 @@ def extract_insn_number_features(
|
||||
example:
|
||||
push 3136B0h ; dwControlCode
|
||||
"""
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
func: Function = fh.inner
|
||||
|
||||
results: list[tuple[Any[Number, OperandNumber], Address]] = []
|
||||
|
||||
# TODO: try to move this out of line
|
||||
def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool:
|
||||
if il.operation == LowLevelILOperation.LLIL_LOAD:
|
||||
return False
|
||||
@@ -168,7 +148,7 @@ def extract_insn_number_features(
|
||||
|
||||
return False
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
visit_llil_exprs(llil, llil_checker)
|
||||
|
||||
yield from results
|
||||
@@ -182,11 +162,11 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl
|
||||
"""
|
||||
func: Function = fh.inner
|
||||
bv: BinaryView = func.view
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
|
||||
candidate_addrs = set()
|
||||
|
||||
if insn.is_call:
|
||||
llil = func.get_llil_at(ih.address)
|
||||
if llil is None or llil.operation in [LowLevelILOperation.LLIL_CALL, LowLevelILOperation.LLIL_CALL_STACK_ADJUST]:
|
||||
return
|
||||
|
||||
for ref in bv.get_code_refs_from(ih.address):
|
||||
@@ -208,7 +188,7 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl
|
||||
|
||||
return True
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
visit_llil_exprs(llil, llil_checker)
|
||||
|
||||
for addr in candidate_addrs:
|
||||
@@ -230,7 +210,6 @@ def extract_insn_string_features(
|
||||
"""
|
||||
func: Function = fh.inner
|
||||
bv: BinaryView = func.view
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
|
||||
candidate_addrs = set()
|
||||
|
||||
@@ -254,7 +233,7 @@ def extract_insn_string_features(
|
||||
|
||||
return True
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
visit_llil_exprs(llil, llil_checker)
|
||||
|
||||
# Now we have all the candidate address, check them for string or pointer to string
|
||||
@@ -287,7 +266,6 @@ def extract_insn_offset_features(
|
||||
.text:0040112F cmp [esi+4], ebx
|
||||
"""
|
||||
func: Function = fh.inner
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
|
||||
results: list[tuple[Any[Offset, OperandOffset], Address]] = []
|
||||
address_size = func.view.arch.address_size * 8
|
||||
@@ -332,7 +310,7 @@ def extract_insn_offset_features(
|
||||
|
||||
return True
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
visit_llil_exprs(llil, llil_checker)
|
||||
|
||||
yield from results
|
||||
@@ -372,7 +350,7 @@ def extract_insn_nzxor_characteristic_features(
|
||||
parse instruction non-zeroing XOR instruction
|
||||
ignore expected non-zeroing XORs, e.g. security cookies
|
||||
"""
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
func: Function = fh.inner
|
||||
|
||||
results = []
|
||||
|
||||
@@ -381,14 +359,14 @@ def extract_insn_nzxor_characteristic_features(
|
||||
# e.g., <llil: eax = 0>, (LLIL_SET_REG). So we do not need to check whether the two operands are the same.
|
||||
if il.operation == LowLevelILOperation.LLIL_XOR:
|
||||
# Exclude cases related to the stack cookie
|
||||
if is_nzxor_stack_cookie(fh.inner, bbh.inner[0], il):
|
||||
if is_nzxor_stack_cookie(fh.inner, bbh.inner, il):
|
||||
return False
|
||||
results.append((Characteristic("nzxor"), ih.address))
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
visit_llil_exprs(llil, llil_checker)
|
||||
|
||||
yield from results
|
||||
@@ -420,7 +398,7 @@ def extract_insn_peb_access_characteristic_features(
|
||||
|
||||
fs:[0x30] on x86, gs:[0x60] on x64
|
||||
"""
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
func: Function = fh.inner
|
||||
|
||||
results = []
|
||||
|
||||
@@ -450,7 +428,7 @@ def extract_insn_peb_access_characteristic_features(
|
||||
results.append((Characteristic("peb access"), ih.address))
|
||||
return False
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
visit_llil_exprs(llil, llil_checker)
|
||||
|
||||
yield from results
|
||||
@@ -460,7 +438,7 @@ def extract_insn_segment_access_features(
|
||||
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
||||
) -> Iterator[tuple[Feature, Address]]:
|
||||
"""parse instruction fs or gs access"""
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
func: Function = fh.inner
|
||||
|
||||
results = []
|
||||
|
||||
@@ -477,7 +455,7 @@ def extract_insn_segment_access_features(
|
||||
|
||||
return True
|
||||
|
||||
for llil in insn.llil:
|
||||
for llil in func.get_llils_at(ih.address):
|
||||
visit_llil_exprs(llil, llil_checker)
|
||||
|
||||
yield from results
|
||||
@@ -505,6 +483,47 @@ def extract_insn_cross_section_cflow(
|
||||
yield Characteristic("cross section flow"), ih.address
|
||||
|
||||
|
||||
def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[tuple[Feature, Address]]:
|
||||
"""extract functions calls from features
|
||||
|
||||
most relevant at the function scope, however, its most efficient to extract at the instruction scope
|
||||
"""
|
||||
func: Function = fh.inner
|
||||
bv: BinaryView = func.view
|
||||
|
||||
if bv is None:
|
||||
return
|
||||
|
||||
for il in func.get_llils_at(ih.address):
|
||||
if il.operation not in [
|
||||
LowLevelILOperation.LLIL_CALL,
|
||||
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
|
||||
LowLevelILOperation.LLIL_TAILCALL,
|
||||
]:
|
||||
continue
|
||||
|
||||
dest = il.dest
|
||||
if dest.operation == LowLevelILOperation.LLIL_CONST_PTR:
|
||||
value = dest.value.value
|
||||
yield Characteristic("calls from"), AbsoluteVirtualAddress(value)
|
||||
elif dest.operation == LowLevelILOperation.LLIL_CONST:
|
||||
yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value)
|
||||
elif dest.operation == LowLevelILOperation.LLIL_LOAD:
|
||||
indirect_src = dest.src
|
||||
if indirect_src.operation == LowLevelILOperation.LLIL_CONST_PTR:
|
||||
value = indirect_src.value.value
|
||||
yield Characteristic("calls from"), AbsoluteVirtualAddress(value)
|
||||
elif indirect_src.operation == LowLevelILOperation.LLIL_CONST:
|
||||
yield Characteristic("calls from"), AbsoluteVirtualAddress(indirect_src.value)
|
||||
elif dest.operation == LowLevelILOperation.LLIL_REG:
|
||||
if dest.value.type in [
|
||||
RegisterValueType.ImportedAddressValue,
|
||||
RegisterValueType.ConstantValue,
|
||||
RegisterValueType.ConstantPointerValue,
|
||||
]:
|
||||
yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value.value)
|
||||
|
||||
|
||||
def extract_function_indirect_call_characteristic_features(
|
||||
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
|
||||
) -> Iterator[tuple[Feature, Address]]:
|
||||
@@ -514,15 +533,14 @@ def extract_function_indirect_call_characteristic_features(
|
||||
most relevant at the function or basic block scope;
|
||||
however, its most efficient to extract at the instruction scope
|
||||
"""
|
||||
insn: DisassemblyInstruction = ih.inner
|
||||
func: Function = fh.inner
|
||||
|
||||
if not insn.is_call:
|
||||
return
|
||||
|
||||
# TODO(williballenthin): when to use one vs many llil instructions
|
||||
# https://github.com/Vector35/binaryninja-api/issues/6205
|
||||
llil = insn.llil[0]
|
||||
if not llil:
|
||||
llil = func.get_llil_at(ih.address)
|
||||
if llil is None or llil.operation not in [
|
||||
LowLevelILOperation.LLIL_CALL,
|
||||
LowLevelILOperation.LLIL_CALL_STACK_ADJUST,
|
||||
LowLevelILOperation.LLIL_TAILCALL,
|
||||
]:
|
||||
return
|
||||
|
||||
if llil.dest.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]:
|
||||
@@ -555,5 +573,6 @@ INSTRUCTION_HANDLERS = (
|
||||
extract_insn_peb_access_characteristic_features,
|
||||
extract_insn_cross_section_cflow,
|
||||
extract_insn_segment_access_features,
|
||||
extract_function_calls_from,
|
||||
extract_function_indirect_call_characteristic_features,
|
||||
)
|
||||
|
||||
@@ -297,7 +297,10 @@ class Call(ExactModel):
|
||||
id: int
|
||||
|
||||
|
||||
class Process(ExactModel):
|
||||
# FlexibleModel to account for extended fields
|
||||
# refs: https://github.com/mandiant/capa/issues/2466
|
||||
# https://github.com/kevoreilly/CAPEv2/pull/2199
|
||||
class Process(FlexibleModel):
|
||||
process_id: int
|
||||
process_name: str
|
||||
parent_id: int
|
||||
@@ -400,7 +403,7 @@ class CapeReport(FlexibleModel):
|
||||
CAPE: Optional[Union[Cape, list]] = None
|
||||
dropped: Optional[list[File]] = None
|
||||
procdump: Optional[list[ProcessFile]] = None
|
||||
procmemory: ListTODO
|
||||
procmemory: Optional[ListTODO] = None
|
||||
|
||||
# =========================================================================
|
||||
# information we won't use in capa
|
||||
|
||||
@@ -41,7 +41,15 @@ if hasattr(ida_bytes, "parse_binpat_str"):
|
||||
return
|
||||
|
||||
while True:
|
||||
ea, _ = ida_bytes.bin_search(start, end, patterns, ida_bytes.BIN_SEARCH_FORWARD)
|
||||
ea = ida_bytes.bin_search(start, end, patterns, ida_bytes.BIN_SEARCH_FORWARD)
|
||||
if isinstance(ea, int):
|
||||
# "ea_t" in IDA 8.4, 8.3
|
||||
pass
|
||||
elif isinstance(ea, tuple):
|
||||
# "drc_t" in IDA 9
|
||||
ea = ea[0]
|
||||
else:
|
||||
raise NotImplementedError(f"bin_search returned unhandled type: {type(ea)}")
|
||||
if ea == idaapi.BADADDR:
|
||||
break
|
||||
start = ea + 1
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
import re
|
||||
import string
|
||||
import contextlib
|
||||
from collections import namedtuple
|
||||
|
||||
@@ -19,6 +20,7 @@ ASCII_RE_4 = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 4))
|
||||
UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4))
|
||||
REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"]
|
||||
SLICE_SIZE = 4096
|
||||
PRINTABLE_CHAR_SET = set(string.printable)
|
||||
|
||||
String = namedtuple("String", ["s", "offset"])
|
||||
|
||||
@@ -84,3 +86,7 @@ def extract_unicode_strings(buf, n=4):
|
||||
for match in r.finditer(buf):
|
||||
with contextlib.suppress(UnicodeDecodeError):
|
||||
yield String(match.group().decode("utf-16"), match.start())
|
||||
|
||||
|
||||
def is_printable_str(s: str) -> bool:
|
||||
return set(s).issubset(PRINTABLE_CHAR_SET)
|
||||
|
||||
@@ -34,7 +34,10 @@ class VMRayMonitorProcess:
|
||||
pid: int # process ID assigned by OS
|
||||
ppid: int # parent process ID assigned by OS
|
||||
monitor_id: int # unique ID assigned to process by VMRay
|
||||
origin_monitor_id: int # unique VMRay ID of parent process
|
||||
image_name: str
|
||||
filename: Optional[str] = ""
|
||||
cmd_line: Optional[str] = ""
|
||||
|
||||
|
||||
class VMRayAnalysis:
|
||||
@@ -160,7 +163,13 @@ class VMRayAnalysis:
|
||||
self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0
|
||||
)
|
||||
self.monitor_processes[process.monitor_id] = VMRayMonitorProcess(
|
||||
process.os_pid, ppid, process.monitor_id, process.image_name
|
||||
process.os_pid,
|
||||
ppid,
|
||||
process.monitor_id,
|
||||
process.origin_monitor_id,
|
||||
process.image_name,
|
||||
process.filename,
|
||||
process.cmd_line,
|
||||
)
|
||||
|
||||
# not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394
|
||||
@@ -169,14 +178,28 @@ class VMRayAnalysis:
|
||||
monitor_process.os_pid,
|
||||
monitor_process.os_parent_pid,
|
||||
monitor_process.process_id,
|
||||
monitor_process.parent_id,
|
||||
monitor_process.image_name,
|
||||
monitor_process.filename,
|
||||
monitor_process.cmd_line,
|
||||
)
|
||||
|
||||
if monitor_process.process_id not in self.monitor_processes:
|
||||
self.monitor_processes[monitor_process.process_id] = vmray_monitor_process
|
||||
else:
|
||||
# we expect monitor processes recorded in both SummaryV2.json and flog.xml to equal
|
||||
assert self.monitor_processes[monitor_process.process_id] == vmray_monitor_process
|
||||
# to ensure this, we compare the pid, monitor_id, and origin_monitor_id
|
||||
# for the other fields we've observed cases with slight deviations, e.g.,
|
||||
# the ppid for a process in flog.xml is not set correctly, all other data is equal
|
||||
sv2p = self.monitor_processes[monitor_process.process_id]
|
||||
if self.monitor_processes[monitor_process.process_id] != vmray_monitor_process:
|
||||
logger.debug("processes differ: %s (sv2) vs. %s (flog)", sv2p, vmray_monitor_process)
|
||||
|
||||
assert (sv2p.pid, sv2p.monitor_id, sv2p.origin_monitor_id) == (
|
||||
vmray_monitor_process.pid,
|
||||
vmray_monitor_process.monitor_id,
|
||||
vmray_monitor_process.origin_monitor_id,
|
||||
)
|
||||
|
||||
def _compute_monitor_threads(self):
|
||||
for monitor_thread in self.flog.analysis.monitor_threads:
|
||||
|
||||
@@ -12,6 +12,7 @@ import capa.features.extractors.helpers
|
||||
from capa.features.insn import API, Number
|
||||
from capa.features.common import String, Feature
|
||||
from capa.features.address import Address
|
||||
from capa.features.extractors.strings import is_printable_str
|
||||
from capa.features.extractors.vmray.models import PARAM_TYPE_INT, PARAM_TYPE_STR, Param, FunctionCall, hexint
|
||||
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
|
||||
|
||||
@@ -27,11 +28,9 @@ def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[tuple[Feat
|
||||
if param.deref.type_ in PARAM_TYPE_INT:
|
||||
yield Number(hexint(param.deref.value)), ch.address
|
||||
elif param.deref.type_ in PARAM_TYPE_STR:
|
||||
# TODO(mr-tz): remove FPS like " \\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\x09\\x0a\\x0b\\x0c\\x0d\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\..."
|
||||
# https://github.com/mandiant/capa/issues/2432
|
||||
|
||||
# parsing the data up to here results in double-escaped backslashes, remove those here
|
||||
yield String(param.deref.value.replace("\\\\", "\\")), ch.address
|
||||
if is_printable_str(param.deref.value):
|
||||
# parsing the data up to here results in double-escaped backslashes, remove those here
|
||||
yield String(param.deref.value.replace("\\\\", "\\")), ch.address
|
||||
else:
|
||||
logger.debug("skipping deref param type %s", param.deref.type_)
|
||||
elif param.value is not None:
|
||||
|
||||
@@ -86,7 +86,7 @@ class VMRayExtractor(DynamicFeatureExtractor):
|
||||
|
||||
def get_process_name(self, ph) -> str:
|
||||
monitor_process: VMRayMonitorProcess = ph.inner
|
||||
return monitor_process.image_name
|
||||
return f"{monitor_process.image_name} ({monitor_process.cmd_line})"
|
||||
|
||||
def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
|
||||
for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
|
||||
|
||||
@@ -136,11 +136,20 @@ class FunctionReturn(BaseModel):
|
||||
from_addr: HexInt = Field(alias="from")
|
||||
|
||||
|
||||
def sanitize_string(value: str) -> str:
|
||||
# e.g. "cmd_line": "\"C:\\Users\\38lTTV5Kii\\Desktop\\filename.exe\" ",
|
||||
return value.replace("\\\\", "\\").strip(' "')
|
||||
|
||||
|
||||
# unify representation
|
||||
SanitizedString = Annotated[str, BeforeValidator(sanitize_string)]
|
||||
|
||||
|
||||
class MonitorProcess(BaseModel):
|
||||
ts: HexInt
|
||||
process_id: int
|
||||
image_name: str
|
||||
filename: str
|
||||
filename: SanitizedString
|
||||
# page_root: HexInt
|
||||
os_pid: HexInt
|
||||
# os_integrity_level: HexInt
|
||||
@@ -148,7 +157,7 @@ class MonitorProcess(BaseModel):
|
||||
monitor_reason: str
|
||||
parent_id: int
|
||||
os_parent_pid: HexInt
|
||||
# cmd_line: str
|
||||
cmd_line: SanitizedString
|
||||
# cur_dir: str
|
||||
# os_username: str
|
||||
# bitness: int
|
||||
@@ -267,7 +276,7 @@ class ElfFileHeader(BaseModel):
|
||||
|
||||
class ElfFile(BaseModel):
|
||||
# file_header: ElfFileHeader
|
||||
sections: list[ElfFileSection]
|
||||
sections: list[ElfFileSection] = []
|
||||
|
||||
|
||||
class StaticData(BaseModel):
|
||||
@@ -305,9 +314,11 @@ class Process(BaseModel):
|
||||
# is_ioc: bool
|
||||
monitor_id: int
|
||||
# monitor_reason: str
|
||||
origin_monitor_id: int # VMRay ID of parent process
|
||||
os_pid: int
|
||||
filename: str
|
||||
filename: Optional[SanitizedString] = ""
|
||||
image_name: str
|
||||
cmd_line: Optional[SanitizedString] = ""
|
||||
ref_parent_process: Optional[GenericReference] = None
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
import binascii
|
||||
from typing import Union, Optional
|
||||
from typing import Union, Literal, Optional, Annotated
|
||||
|
||||
from pydantic import Field, BaseModel, ConfigDict
|
||||
|
||||
@@ -209,168 +209,171 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
|
||||
|
||||
|
||||
class OSFeature(FeatureModel):
|
||||
type: str = "os"
|
||||
type: Literal["os"] = "os"
|
||||
os: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class ArchFeature(FeatureModel):
|
||||
type: str = "arch"
|
||||
type: Literal["arch"] = "arch"
|
||||
arch: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class FormatFeature(FeatureModel):
|
||||
type: str = "format"
|
||||
type: Literal["format"] = "format"
|
||||
format: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class MatchFeature(FeatureModel):
|
||||
type: str = "match"
|
||||
type: Literal["match"] = "match"
|
||||
match: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class CharacteristicFeature(FeatureModel):
|
||||
type: str = "characteristic"
|
||||
type: Literal["characteristic"] = "characteristic"
|
||||
characteristic: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class ExportFeature(FeatureModel):
|
||||
type: str = "export"
|
||||
type: Literal["export"] = "export"
|
||||
export: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class ImportFeature(FeatureModel):
|
||||
type: str = "import"
|
||||
type: Literal["import"] = "import"
|
||||
import_: str = Field(alias="import")
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class SectionFeature(FeatureModel):
|
||||
type: str = "section"
|
||||
type: Literal["section"] = "section"
|
||||
section: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class FunctionNameFeature(FeatureModel):
|
||||
type: str = "function name"
|
||||
type: Literal["function name"] = "function name"
|
||||
function_name: str = Field(alias="function name")
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class SubstringFeature(FeatureModel):
|
||||
type: str = "substring"
|
||||
type: Literal["substring"] = "substring"
|
||||
substring: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class RegexFeature(FeatureModel):
|
||||
type: str = "regex"
|
||||
type: Literal["regex"] = "regex"
|
||||
regex: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class StringFeature(FeatureModel):
|
||||
type: str = "string"
|
||||
type: Literal["string"] = "string"
|
||||
string: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class ClassFeature(FeatureModel):
|
||||
type: str = "class"
|
||||
type: Literal["class"] = "class"
|
||||
class_: str = Field(alias="class")
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class NamespaceFeature(FeatureModel):
|
||||
type: str = "namespace"
|
||||
type: Literal["namespace"] = "namespace"
|
||||
namespace: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class BasicBlockFeature(FeatureModel):
|
||||
type: str = "basic block"
|
||||
type: Literal["basic block"] = "basic block"
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class APIFeature(FeatureModel):
|
||||
type: str = "api"
|
||||
type: Literal["api"] = "api"
|
||||
api: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class PropertyFeature(FeatureModel):
|
||||
type: str = "property"
|
||||
type: Literal["property"] = "property"
|
||||
access: Optional[str] = None
|
||||
property: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class NumberFeature(FeatureModel):
|
||||
type: str = "number"
|
||||
type: Literal["number"] = "number"
|
||||
number: Union[int, float]
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class BytesFeature(FeatureModel):
|
||||
type: str = "bytes"
|
||||
type: Literal["bytes"] = "bytes"
|
||||
bytes: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class OffsetFeature(FeatureModel):
|
||||
type: str = "offset"
|
||||
type: Literal["offset"] = "offset"
|
||||
offset: int
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class MnemonicFeature(FeatureModel):
|
||||
type: str = "mnemonic"
|
||||
type: Literal["mnemonic"] = "mnemonic"
|
||||
mnemonic: str
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class OperandNumberFeature(FeatureModel):
|
||||
type: str = "operand number"
|
||||
type: Literal["operand number"] = "operand number"
|
||||
index: int
|
||||
operand_number: int = Field(alias="operand number")
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
class OperandOffsetFeature(FeatureModel):
|
||||
type: str = "operand offset"
|
||||
type: Literal["operand offset"] = "operand offset"
|
||||
index: int
|
||||
operand_offset: int = Field(alias="operand offset")
|
||||
description: Optional[str] = None
|
||||
|
||||
|
||||
Feature = Union[
|
||||
OSFeature,
|
||||
ArchFeature,
|
||||
FormatFeature,
|
||||
MatchFeature,
|
||||
CharacteristicFeature,
|
||||
ExportFeature,
|
||||
ImportFeature,
|
||||
SectionFeature,
|
||||
FunctionNameFeature,
|
||||
SubstringFeature,
|
||||
RegexFeature,
|
||||
StringFeature,
|
||||
ClassFeature,
|
||||
NamespaceFeature,
|
||||
APIFeature,
|
||||
PropertyFeature,
|
||||
NumberFeature,
|
||||
BytesFeature,
|
||||
OffsetFeature,
|
||||
MnemonicFeature,
|
||||
OperandNumberFeature,
|
||||
OperandOffsetFeature,
|
||||
# Note! this must be last, see #1161
|
||||
BasicBlockFeature,
|
||||
Feature = Annotated[
|
||||
Union[
|
||||
OSFeature,
|
||||
ArchFeature,
|
||||
FormatFeature,
|
||||
MatchFeature,
|
||||
CharacteristicFeature,
|
||||
ExportFeature,
|
||||
ImportFeature,
|
||||
SectionFeature,
|
||||
FunctionNameFeature,
|
||||
SubstringFeature,
|
||||
RegexFeature,
|
||||
StringFeature,
|
||||
ClassFeature,
|
||||
NamespaceFeature,
|
||||
APIFeature,
|
||||
PropertyFeature,
|
||||
NumberFeature,
|
||||
BytesFeature,
|
||||
OffsetFeature,
|
||||
MnemonicFeature,
|
||||
OperandNumberFeature,
|
||||
OperandOffsetFeature,
|
||||
# Note! this must be last, see #1161
|
||||
BasicBlockFeature,
|
||||
],
|
||||
Field(discriminator="type"),
|
||||
]
|
||||
|
||||
@@ -46,6 +46,7 @@ from capa.features.common import (
|
||||
FORMAT_FREEZE,
|
||||
FORMAT_DRAKVUF,
|
||||
FORMAT_UNKNOWN,
|
||||
FORMAT_BINJA_DB,
|
||||
FORMAT_BINEXPORT2,
|
||||
Format,
|
||||
)
|
||||
@@ -59,6 +60,7 @@ EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", "log", ".log.gz", ".zip")
|
||||
EXTENSIONS_BINEXPORT2 = ("BinExport", "BinExport2")
|
||||
EXTENSIONS_ELF = "elf_"
|
||||
EXTENSIONS_FREEZE = "frz"
|
||||
EXTENSIONS_BINJA_DB = "bndb"
|
||||
|
||||
logger = logging.getLogger("capa")
|
||||
|
||||
@@ -232,6 +234,8 @@ def get_format_from_extension(sample: Path) -> str:
|
||||
format_ = FORMAT_FREEZE
|
||||
elif sample.name.endswith(EXTENSIONS_BINEXPORT2):
|
||||
format_ = FORMAT_BINEXPORT2
|
||||
elif sample.name.endswith(EXTENSIONS_BINJA_DB):
|
||||
format_ = FORMAT_BINJA_DB
|
||||
return format_
|
||||
|
||||
|
||||
|
||||
@@ -271,7 +271,12 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
@param checked: True, item checked, False item not checked
|
||||
"""
|
||||
if not isinstance(
|
||||
item, (CapaExplorerStringViewItem, CapaExplorerInstructionViewItem, CapaExplorerByteViewItem)
|
||||
item,
|
||||
(
|
||||
CapaExplorerStringViewItem,
|
||||
CapaExplorerInstructionViewItem,
|
||||
CapaExplorerByteViewItem,
|
||||
),
|
||||
):
|
||||
# ignore other item types
|
||||
return
|
||||
@@ -433,11 +438,19 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
|
||||
if isinstance(match.node, rd.StatementNode):
|
||||
parent2 = self.render_capa_doc_statement_node(
|
||||
parent, match, match.node.statement, [addr.to_capa() for addr in match.locations], doc
|
||||
parent,
|
||||
match,
|
||||
match.node.statement,
|
||||
[addr.to_capa() for addr in match.locations],
|
||||
doc,
|
||||
)
|
||||
elif isinstance(match.node, rd.FeatureNode):
|
||||
parent2 = self.render_capa_doc_feature_node(
|
||||
parent, match, match.node.feature, [addr.to_capa() for addr in match.locations], doc
|
||||
parent,
|
||||
match,
|
||||
match.node.feature,
|
||||
[addr.to_capa() for addr in match.locations],
|
||||
doc,
|
||||
)
|
||||
else:
|
||||
raise RuntimeError("unexpected node type: " + str(match.node.type))
|
||||
@@ -494,7 +507,13 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
for rule in rutils.capability_rules(doc):
|
||||
rule_name = rule.meta.name
|
||||
rule_namespace = rule.meta.namespace or ""
|
||||
parent = CapaExplorerRuleItem(self.root_node, rule_name, rule_namespace, len(rule.matches), rule.source)
|
||||
parent = CapaExplorerRuleItem(
|
||||
self.root_node,
|
||||
rule_name,
|
||||
rule_namespace,
|
||||
len(rule.matches),
|
||||
rule.source,
|
||||
)
|
||||
|
||||
for location_, match in rule.matches:
|
||||
location = location_.to_capa()
|
||||
@@ -529,12 +548,12 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
# inform model changes have ended
|
||||
self.endResetModel()
|
||||
|
||||
def capa_doc_feature_to_display(self, feature: frzf.Feature):
|
||||
def capa_doc_feature_to_display(self, feature: frzf.Feature) -> str:
|
||||
"""convert capa doc feature type string to display string for ui
|
||||
|
||||
@param feature: capa feature read from doc
|
||||
"""
|
||||
key = feature.type
|
||||
key = str(feature.type)
|
||||
value = feature.dict(by_alias=True).get(feature.type)
|
||||
|
||||
if value:
|
||||
@@ -640,7 +659,10 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
assert isinstance(addr, frz.Address)
|
||||
if location == addr.value:
|
||||
return CapaExplorerStringViewItem(
|
||||
parent, display, location, '"' + capa.features.common.escape_string(capture) + '"'
|
||||
parent,
|
||||
display,
|
||||
location,
|
||||
'"' + capa.features.common.escape_string(capture) + '"',
|
||||
)
|
||||
|
||||
# programming error: the given location should always be found in the regex matches
|
||||
@@ -671,7 +693,10 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
elif isinstance(feature, frzf.StringFeature):
|
||||
# display string preview
|
||||
return CapaExplorerStringViewItem(
|
||||
parent, display, location, f'"{capa.features.common.escape_string(feature.string)}"'
|
||||
parent,
|
||||
display,
|
||||
location,
|
||||
f'"{capa.features.common.escape_string(feature.string)}"',
|
||||
)
|
||||
|
||||
elif isinstance(
|
||||
@@ -713,7 +738,11 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
|
||||
# recursive search for all instances of old function name
|
||||
for model_index in self.match(
|
||||
root_index, QtCore.Qt.DisplayRole, old_name, hits=-1, flags=QtCore.Qt.MatchRecursive
|
||||
root_index,
|
||||
QtCore.Qt.DisplayRole,
|
||||
old_name,
|
||||
hits=-1,
|
||||
flags=QtCore.Qt.MatchRecursive,
|
||||
):
|
||||
if not isinstance(model_index.internalPointer(), CapaExplorerFunctionItem):
|
||||
continue
|
||||
|
||||
@@ -48,6 +48,7 @@ from capa.features.common import (
|
||||
FORMAT_VMRAY,
|
||||
FORMAT_DOTNET,
|
||||
FORMAT_DRAKVUF,
|
||||
FORMAT_BINJA_DB,
|
||||
FORMAT_BINEXPORT2,
|
||||
)
|
||||
from capa.features.address import Address
|
||||
@@ -239,7 +240,6 @@ def get_extractor(
|
||||
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path)
|
||||
|
||||
elif backend == BACKEND_BINJA:
|
||||
import capa.perf as perf
|
||||
import capa.features.extractors.binja.find_binja_api as finder
|
||||
|
||||
if not finder.has_binaryninja():
|
||||
@@ -252,7 +252,7 @@ def get_extractor(
|
||||
|
||||
import capa.features.extractors.binja.extractor
|
||||
|
||||
if input_format not in (FORMAT_SC32, FORMAT_SC64):
|
||||
if input_format not in (FORMAT_SC32, FORMAT_SC64, FORMAT_BINJA_DB):
|
||||
if not is_supported_format(input_path):
|
||||
raise UnsupportedFormatError()
|
||||
|
||||
@@ -263,10 +263,9 @@ def get_extractor(
|
||||
raise UnsupportedOSError()
|
||||
|
||||
with console.status("analyzing program...", spinner="dots"):
|
||||
with perf.timing("binary ninja: loading program"):
|
||||
bv: binaryninja.BinaryView = binaryninja.load(str(input_path))
|
||||
if bv is None:
|
||||
raise RuntimeError(f"Binary Ninja cannot open file {input_path}")
|
||||
bv: binaryninja.BinaryView = binaryninja.load(str(input_path))
|
||||
if bv is None:
|
||||
raise RuntimeError(f"Binary Ninja cannot open file {input_path}")
|
||||
|
||||
return capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv)
|
||||
|
||||
|
||||
17
capa/main.py
17
capa/main.py
@@ -92,6 +92,7 @@ from capa.features.common import (
|
||||
FORMAT_DRAKVUF,
|
||||
STATIC_FORMATS,
|
||||
DYNAMIC_FORMATS,
|
||||
FORMAT_BINJA_DB,
|
||||
FORMAT_BINEXPORT2,
|
||||
)
|
||||
from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities
|
||||
@@ -197,7 +198,7 @@ def simple_message_exception_handler(
|
||||
else:
|
||||
print(
|
||||
f"Unexpected exception raised: {exctype}. Please run capa in debug mode (-d/--debug) "
|
||||
+ "to see the stack trace. Please also report your issue on the capa GitHub page so we "
|
||||
+ "to see the stack trace.\nPlease also report your issue on the capa GitHub page so we "
|
||||
+ "can improve the code! (https://github.com/mandiant/capa/issues)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
@@ -266,6 +267,7 @@ def install_common_args(parser, wanted=None):
|
||||
(FORMAT_VMRAY, "VMRay sandbox report"),
|
||||
(FORMAT_FREEZE, "features previously frozen by capa"),
|
||||
(FORMAT_BINEXPORT2, "BinExport2"),
|
||||
(FORMAT_BINJA_DB, "Binary Ninja Database"),
|
||||
]
|
||||
format_help = ", ".join([f"{f[0]}: {f[1]}" for f in formats])
|
||||
|
||||
@@ -746,15 +748,13 @@ def find_file_limitations_from_cli(args, rules: RuleSet, file_extractors: list[F
|
||||
args:
|
||||
args: The parsed command line arguments from `install_common_args`.
|
||||
|
||||
Dynamic feature extractors can handle packed samples and do not need to be considered here.
|
||||
|
||||
raises:
|
||||
ShouldExitError: if the program is invoked incorrectly and should exit.
|
||||
"""
|
||||
found_file_limitation = False
|
||||
for file_extractor in file_extractors:
|
||||
if isinstance(file_extractor, DynamicFeatureExtractor):
|
||||
# Dynamic feature extractors can handle packed samples
|
||||
continue
|
||||
|
||||
try:
|
||||
pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {})
|
||||
except PEFormatError as e:
|
||||
@@ -960,8 +960,11 @@ def main(argv: Optional[list[str]] = None):
|
||||
ensure_input_exists_from_cli(args)
|
||||
input_format = get_input_format_from_cli(args)
|
||||
rules = get_rules_from_cli(args)
|
||||
file_extractors = get_file_extractors_from_cli(args, input_format)
|
||||
found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors)
|
||||
found_file_limitation = False
|
||||
if input_format in STATIC_FORMATS:
|
||||
# only static extractors have file limitations
|
||||
file_extractors = get_file_extractors_from_cli(args, input_format)
|
||||
found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors)
|
||||
except ShouldExitError as e:
|
||||
return e.status_code
|
||||
|
||||
|
||||
20
capa/perf.py
20
capa/perf.py
@@ -5,9 +5,6 @@
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
import time
|
||||
import inspect
|
||||
import contextlib
|
||||
import collections
|
||||
|
||||
# this structure is unstable and may change before the next major release.
|
||||
@@ -17,20 +14,3 @@ counters: collections.Counter[str] = collections.Counter()
|
||||
def reset():
|
||||
global counters
|
||||
counters = collections.Counter()
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def timing(msg: str):
|
||||
"""log the given message start/stop and time taken, using the caller's `logger` instance."""
|
||||
# stack:
|
||||
# 0: here
|
||||
# 1: contextlib
|
||||
# 2: caller
|
||||
caller = inspect.stack()[2]
|
||||
caller_logger = caller.frame.f_globals.get("logger")
|
||||
|
||||
caller_logger.debug("%s...", msg)
|
||||
t0 = time.time()
|
||||
yield
|
||||
t1 = time.time()
|
||||
caller_logger.debug("%s done in %0.1fs.", msg, t1 - t0)
|
||||
|
||||
@@ -168,7 +168,7 @@ def render_feature(
|
||||
):
|
||||
console.write(" " * indent)
|
||||
|
||||
key = feature.type
|
||||
key = str(feature.type)
|
||||
value: Optional[str]
|
||||
if isinstance(feature, frzf.BasicBlockFeature):
|
||||
# i don't think it makes sense to have standalone basic block features.
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
__version__ = "7.4.0"
|
||||
__version__ = "8.0.1"
|
||||
|
||||
|
||||
def get_major_version():
|
||||
|
||||
@@ -38,12 +38,12 @@
|
||||
```
|
||||
- [ ] Update [capa/version.py](https://github.com/mandiant/capa/blob/master/capa/version.py)
|
||||
- [ ] Create a PR with the updated [CHANGELOG.md](https://github.com/mandiant/capa/blob/master/CHANGELOG.md) and [capa/version.py](https://github.com/mandiant/capa/blob/master/capa/version.py). Copy this checklist in the PR description.
|
||||
- [ ] Update the [homepage](https://github.com/mandiant/capa/blob/master/web/public/index.html) (i.e. What's New section)
|
||||
- [ ] After PR review, merge the PR and [create the release in GH](https://github.com/mandiant/capa/releases/new) using text from the [CHANGELOG.md](https://github.com/mandiant/capa/blob/master/CHANGELOG.md).
|
||||
- Verify GH actions
|
||||
- [ ] [upload artifacts](https://github.com/mandiant/capa/releases)
|
||||
- [ ] [publish to PyPI](https://pypi.org/project/flare-capa)
|
||||
- [ ] [create tag in capa rules](https://github.com/mandiant/capa-rules/tags)
|
||||
- [ ] [create release in capa rules](https://github.com/mandiant/capa-rules/releases)
|
||||
- [ ] Update [homepage](https://github.com/mandiant/capa/blob/master/web/public/index.html)
|
||||
- [ ] [Spread the word](https://twitter.com)
|
||||
- [ ] Update internal service
|
||||
|
||||
@@ -77,6 +77,8 @@ dependencies = [
|
||||
"protobuf>=5",
|
||||
"msgspec>=0.18.6",
|
||||
"xmltodict>=0.13.0",
|
||||
# for library detection (in development)
|
||||
"nltk>=3",
|
||||
|
||||
# ---------------------------------------
|
||||
# Dependencies that we develop
|
||||
@@ -120,7 +122,7 @@ dev = [
|
||||
"pytest-instafail==0.5.0",
|
||||
"pytest-cov==6.0.0",
|
||||
"flake8==7.1.1",
|
||||
"flake8-bugbear==24.10.31",
|
||||
"flake8-bugbear==24.12.12",
|
||||
"flake8-encodings==0.5.1",
|
||||
"flake8-comprehensions==3.16.0",
|
||||
"flake8-logging-format==0.9.0",
|
||||
|
||||
@@ -22,7 +22,7 @@ msgpack==1.0.8
|
||||
networkx==3.4.2
|
||||
pefile==2024.8.26
|
||||
pip==24.3.1
|
||||
protobuf==5.28.2
|
||||
protobuf==5.29.0
|
||||
pyasn1==0.5.1
|
||||
pyasn1-modules==0.3.0
|
||||
pycparser==2.22
|
||||
|
||||
2
rules
2
rules
Submodule rules updated: f3bb093ab0...ff9db74425
970
scripts/codecut.py
Normal file
970
scripts/codecut.py
Normal file
@@ -0,0 +1,970 @@
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import argparse
|
||||
import subprocess
|
||||
from typing import Iterator, Optional, Literal
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing import Pool
|
||||
|
||||
import pefile
|
||||
import lancelot
|
||||
import networkx as nx
|
||||
import lancelot.be2utils
|
||||
from lancelot.be2utils import AddressSpace, BinExport2Index, ReadMemoryError
|
||||
from lancelot.be2utils.binexport2_pb2 import BinExport2
|
||||
|
||||
import capa.main
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool:
|
||||
return vertex.HasField("type") and vertex.type == type_
|
||||
|
||||
|
||||
def is_vertex_thunk(vertex: BinExport2.CallGraph.Vertex) -> bool:
|
||||
return is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.THUNK)
|
||||
|
||||
|
||||
THUNK_CHAIN_DEPTH_DELTA = 5
|
||||
|
||||
|
||||
def compute_thunks(be2: BinExport2, idx: BinExport2Index) -> dict[int, int]:
|
||||
# from thunk address to target function address
|
||||
thunks: dict[int, int] = {}
|
||||
|
||||
for addr, vertex_idx in idx.vertex_index_by_address.items():
|
||||
vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx]
|
||||
if not is_vertex_thunk(vertex):
|
||||
continue
|
||||
|
||||
curr_vertex_idx: int = vertex_idx
|
||||
for _ in range(THUNK_CHAIN_DEPTH_DELTA):
|
||||
thunk_callees: list[int] = idx.callees_by_vertex_index[curr_vertex_idx]
|
||||
# if this doesn't hold, then it doesn't seem like this is a thunk,
|
||||
# because either, len is:
|
||||
# 0 and the thunk doesn't point to anything, such as `jmp eax`, or
|
||||
# >1 and the thunk may end up at many functions.
|
||||
|
||||
if not thunk_callees:
|
||||
# maybe we have an indirect jump, like `jmp eax`
|
||||
# that we can't actually resolve here.
|
||||
break
|
||||
|
||||
if len(thunk_callees) != 1:
|
||||
for thunk_callee in thunk_callees:
|
||||
logger.warning("%s", hex(be2.call_graph.vertex[thunk_callee].address))
|
||||
assert len(thunk_callees) == 1, f"thunk @ {hex(addr)} failed"
|
||||
|
||||
thunked_vertex_idx: int = thunk_callees[0]
|
||||
thunked_vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[thunked_vertex_idx]
|
||||
|
||||
if not is_vertex_thunk(thunked_vertex):
|
||||
assert thunked_vertex.HasField("address")
|
||||
|
||||
thunks[addr] = thunked_vertex.address
|
||||
break
|
||||
|
||||
curr_vertex_idx = thunked_vertex_idx
|
||||
|
||||
return thunks
|
||||
|
||||
|
||||
def read_string(address_space: AddressSpace, address: int) -> Optional[str]:
|
||||
try:
|
||||
# if at end of segment then there might be an overrun here.
|
||||
buf: bytes = address_space.read_memory(address, 0x100)
|
||||
|
||||
except ReadMemoryError:
|
||||
logger.debug("failed to read memory: 0x%x", address)
|
||||
return None
|
||||
|
||||
# note: we *always* break after the first iteration
|
||||
for s in capa.features.extractors.strings.extract_ascii_strings(buf):
|
||||
if s.offset != 0:
|
||||
break
|
||||
|
||||
return s.s
|
||||
|
||||
# note: we *always* break after the first iteration
|
||||
for s in capa.features.extractors.strings.extract_unicode_strings(buf):
|
||||
if s.offset != 0:
|
||||
break
|
||||
|
||||
return s.s
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@dataclass
|
||||
class AssemblageRow:
|
||||
# from table: binaries
|
||||
binary_id: int
|
||||
file_name: str
|
||||
platform: str
|
||||
build_mode: str
|
||||
toolset_version: str
|
||||
github_url: str
|
||||
optimization: str
|
||||
repo_last_update: int
|
||||
size: int
|
||||
path: str
|
||||
license: str
|
||||
binary_hash: str
|
||||
repo_commit_hash: str
|
||||
# from table: functions
|
||||
function_id: int
|
||||
function_name: str
|
||||
function_hash: str
|
||||
top_comments: str
|
||||
source_codes: str
|
||||
prototype: str
|
||||
_source_file: str
|
||||
# from table: rvas
|
||||
rva_id: int
|
||||
start_rva: int
|
||||
end_rva: int
|
||||
|
||||
@property
|
||||
def source_file(self):
|
||||
# cleanup some extra metadata provided by assemblage
|
||||
return self._source_file.partition(" (MD5: ")[0].partition(" (0x3: ")[0]
|
||||
|
||||
|
||||
class Assemblage:
|
||||
conn: sqlite3.Connection
|
||||
samples: Path
|
||||
|
||||
def __init__(self, db: Path, samples: Path):
|
||||
super().__init__()
|
||||
|
||||
self.db = db
|
||||
self.samples = samples
|
||||
|
||||
self.conn = sqlite3.connect(self.db)
|
||||
with self.conn:
|
||||
self.conn.executescript(
|
||||
"""
|
||||
PRAGMA journal_mode = WAL;
|
||||
PRAGMA synchronous = NORMAL;
|
||||
PRAGMA busy_timeout = 5000;
|
||||
PRAGMA cache_size = -20000; -- 20MB
|
||||
PRAGMA foreign_keys = true;
|
||||
PRAGMA temp_store = memory;
|
||||
|
||||
BEGIN IMMEDIATE TRANSACTION;
|
||||
CREATE INDEX IF NOT EXISTS idx__functions__binary_id ON functions (binary_id);
|
||||
CREATE INDEX IF NOT EXISTS idx__rvas__function_id ON rvas (function_id);
|
||||
|
||||
CREATE VIEW IF NOT EXISTS assemblage AS
|
||||
SELECT
|
||||
binaries.id AS binary_id,
|
||||
binaries.file_name AS file_name,
|
||||
binaries.platform AS platform,
|
||||
binaries.build_mode AS build_mode,
|
||||
binaries.toolset_version AS toolset_version,
|
||||
binaries.github_url AS github_url,
|
||||
binaries.optimization AS optimization,
|
||||
binaries.repo_last_update AS repo_last_update,
|
||||
binaries.size AS size,
|
||||
binaries.path AS path,
|
||||
binaries.license AS license,
|
||||
binaries.hash AS hash,
|
||||
binaries.repo_commit_hash AS repo_commit_hash,
|
||||
|
||||
functions.id AS function_id,
|
||||
functions.name AS function_name,
|
||||
functions.hash AS function_hash,
|
||||
functions.top_comments AS top_comments,
|
||||
functions.source_codes AS source_codes,
|
||||
functions.prototype AS prototype,
|
||||
functions.source_file AS source_file,
|
||||
|
||||
rvas.id AS rva_id,
|
||||
rvas.start AS start_rva,
|
||||
rvas.end AS end_rva
|
||||
FROM binaries
|
||||
JOIN functions ON binaries.id = functions.binary_id
|
||||
JOIN rvas ON functions.id = rvas.function_id;
|
||||
"""
|
||||
)
|
||||
|
||||
def get_row_by_binary_id(self, binary_id: int) -> AssemblageRow:
|
||||
with self.conn:
|
||||
cur = self.conn.execute("SELECT * FROM assemblage WHERE binary_id = ? LIMIT 1;", (binary_id,))
|
||||
return AssemblageRow(*cur.fetchone())
|
||||
|
||||
def get_rows_by_binary_id(self, binary_id: int) -> Iterator[AssemblageRow]:
|
||||
with self.conn:
|
||||
cur = self.conn.execute("SELECT * FROM assemblage WHERE binary_id = ?;", (binary_id,))
|
||||
row = cur.fetchone()
|
||||
while row:
|
||||
yield AssemblageRow(*row)
|
||||
row = cur.fetchone()
|
||||
|
||||
def get_path_by_binary_id(self, binary_id: int) -> Path:
|
||||
with self.conn:
|
||||
cur = self.conn.execute("""SELECT path FROM assemblage WHERE binary_id = ? LIMIT 1""", (binary_id,))
|
||||
return self.samples / cur.fetchone()[0]
|
||||
|
||||
def get_pe_by_binary_id(self, binary_id: int) -> pefile.PE:
|
||||
path = self.get_path_by_binary_id(binary_id)
|
||||
return pefile.PE(data=path.read_bytes(), fast_load=True)
|
||||
|
||||
def get_binary_ids(self) -> Iterator[int]:
|
||||
with self.conn:
|
||||
cur = self.conn.execute("SELECT DISTINCT binary_id FROM assemblage ORDER BY binary_id ASC;")
|
||||
row = cur.fetchone()
|
||||
while row:
|
||||
yield row[0]
|
||||
row = cur.fetchone()
|
||||
|
||||
|
||||
def generate_main(args: argparse.Namespace) -> int:
|
||||
if not args.assemblage_database.is_file():
|
||||
raise ValueError("database doesn't exist")
|
||||
|
||||
db = Assemblage(args.assemblage_database, args.assemblage_directory)
|
||||
|
||||
pe = db.get_pe_by_binary_id(args.binary_id)
|
||||
base_address: int = pe.OPTIONAL_HEADER.ImageBase
|
||||
|
||||
functions_by_address = {
|
||||
base_address + function.start_rva: function for function in db.get_rows_by_binary_id(args.binary_id)
|
||||
}
|
||||
|
||||
hash = db.get_row_by_binary_id(args.binary_id).binary_hash
|
||||
|
||||
def make_node_id(address: int) -> str:
|
||||
return f"{hash}:{address:x}"
|
||||
|
||||
pe_path = db.get_path_by_binary_id(args.binary_id)
|
||||
be2: BinExport2 = lancelot.get_binexport2_from_bytes(
|
||||
pe_path.read_bytes(), function_hints=list(functions_by_address.keys())
|
||||
)
|
||||
|
||||
idx = lancelot.be2utils.BinExport2Index(be2)
|
||||
address_space = lancelot.be2utils.AddressSpace.from_pe(pe, base_address)
|
||||
thunks = compute_thunks(be2, idx)
|
||||
|
||||
g = nx.MultiDiGraph()
|
||||
|
||||
# ensure all functions from ground truth have an entry
|
||||
for address, function in functions_by_address.items():
|
||||
g.add_node(
|
||||
make_node_id(address),
|
||||
address=address,
|
||||
type="function",
|
||||
)
|
||||
|
||||
for flow_graph in be2.flow_graph:
|
||||
datas: set[int] = set()
|
||||
callees: set[int] = set()
|
||||
|
||||
entry_basic_block_index: int = flow_graph.entry_basic_block_index
|
||||
flow_graph_address: int = idx.get_basic_block_address(entry_basic_block_index)
|
||||
|
||||
for basic_block_index in flow_graph.basic_block_index:
|
||||
basic_block: BinExport2.BasicBlock = be2.basic_block[basic_block_index]
|
||||
|
||||
for instruction_index, instruction, _ in idx.basic_block_instructions(basic_block):
|
||||
for addr in instruction.call_target:
|
||||
addr = thunks.get(addr, addr)
|
||||
|
||||
if addr not in idx.vertex_index_by_address:
|
||||
# disassembler did not define function at address
|
||||
logger.debug("0x%x is not a vertex", addr)
|
||||
continue
|
||||
|
||||
vertex_idx: int = idx.vertex_index_by_address[addr]
|
||||
vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx]
|
||||
|
||||
callees.add(vertex.address)
|
||||
|
||||
for data_reference_index in idx.data_reference_index_by_source_instruction_index.get(
|
||||
instruction_index, []
|
||||
):
|
||||
data_reference: BinExport2.DataReference = be2.data_reference[data_reference_index]
|
||||
data_reference_address: int = data_reference.address
|
||||
|
||||
if data_reference_address in idx.insn_address_by_index:
|
||||
# appears to be code
|
||||
continue
|
||||
|
||||
datas.add(data_reference_address)
|
||||
|
||||
vertex_index = idx.vertex_index_by_address[flow_graph_address]
|
||||
name = idx.get_function_name_by_vertex(vertex_index)
|
||||
|
||||
g.add_node(
|
||||
make_node_id(flow_graph_address),
|
||||
address=flow_graph_address,
|
||||
type="function",
|
||||
)
|
||||
if datas or callees:
|
||||
logger.info("%s @ 0x%X:", name, flow_graph_address)
|
||||
|
||||
for data_address in sorted(datas):
|
||||
logger.info(" - 0x%X", data_address)
|
||||
# TODO: check if this is already a function
|
||||
g.add_node(
|
||||
make_node_id(data_address),
|
||||
address=data_address,
|
||||
type="data",
|
||||
)
|
||||
g.add_edge(
|
||||
make_node_id(flow_graph_address),
|
||||
make_node_id(data_address),
|
||||
key="reference",
|
||||
)
|
||||
|
||||
for callee in sorted(callees):
|
||||
logger.info(" - %s", idx.get_function_name_by_address(callee))
|
||||
|
||||
g.add_node(
|
||||
make_node_id(callee),
|
||||
address=callee,
|
||||
type="function",
|
||||
)
|
||||
g.add_edge(
|
||||
make_node_id(flow_graph_address),
|
||||
make_node_id(callee),
|
||||
key="call",
|
||||
)
|
||||
|
||||
else:
|
||||
logger.info("%s @ 0x%X: (none)", name, flow_graph_address)
|
||||
|
||||
# set ground truth node attributes from source data
|
||||
for node, attrs in g.nodes(data=True):
|
||||
if attrs["type"] != "function":
|
||||
continue
|
||||
|
||||
if f := functions_by_address.get(attrs["address"]):
|
||||
attrs["name"] = f.function_name
|
||||
attrs["file"] = f.file_name
|
||||
|
||||
for section in pe.sections:
|
||||
# Within each section, emit a neighbor edge for each pair of neighbors.
|
||||
# Neighbors only link nodes of the same type, because assemblage doesn't
|
||||
# have ground truth for data items, so we don't quite know where to split.
|
||||
# Consider this situation:
|
||||
#
|
||||
# moduleA::func1
|
||||
# --- cut ---
|
||||
# moduleB::func1
|
||||
#
|
||||
# that one is ok, but this is hard:
|
||||
#
|
||||
# moduleA::func1
|
||||
# --- cut??? ---
|
||||
# dataZ
|
||||
# --- or cut here??? ---
|
||||
# moduleB::func1
|
||||
#
|
||||
# Does the cut go before or after dataZ?
|
||||
# So, we only have neighbor graphs within functions, and within datas.
|
||||
# For datas, we don't allow interspersed functions.
|
||||
|
||||
section_nodes = sorted(
|
||||
[
|
||||
(node, attrs)
|
||||
for node, attrs in g.nodes(data=True)
|
||||
if (section.VirtualAddress + base_address)
|
||||
<= attrs["address"]
|
||||
< (base_address + section.VirtualAddress + section.Misc_VirtualSize)
|
||||
],
|
||||
key=lambda p: p[1]["address"],
|
||||
)
|
||||
|
||||
# add neighbor edges between data items.
|
||||
# the data items must not be separated by any functions.
|
||||
for i in range(1, len(section_nodes)):
|
||||
a, a_attrs = section_nodes[i - 1]
|
||||
b, b_attrs = section_nodes[i]
|
||||
|
||||
if a_attrs["type"] != "data":
|
||||
continue
|
||||
|
||||
if b_attrs["type"] != "data":
|
||||
continue
|
||||
|
||||
g.add_edge(a, b, key="neighbor")
|
||||
g.add_edge(b, a, key="neighbor")
|
||||
|
||||
section_functions = [
|
||||
(node, attrs)
|
||||
for node, attrs in section_nodes
|
||||
if attrs["type"] == "function"
|
||||
# we only have ground truth for the known functions
|
||||
# so only consider those in the function neighbor graph.
|
||||
and attrs["address"] in functions_by_address
|
||||
]
|
||||
|
||||
# add neighbor edges between functions.
|
||||
# we drop the potentially interspersed data items before computing these edges.
|
||||
for i in range(1, len(section_functions)):
|
||||
a, a_attrs = section_functions[i - 1]
|
||||
b, b_attrs = section_functions[i]
|
||||
is_boundary = a_attrs["file"] == b_attrs["file"]
|
||||
|
||||
# edge attribute: is_source_file_boundary
|
||||
g.add_edge(a, b, key="neighbor", is_source_file_boundary=is_boundary)
|
||||
g.add_edge(b, a, key="neighbor", is_source_file_boundary=is_boundary)
|
||||
|
||||
# rename unknown functions like: sub_401000
|
||||
for n, attrs in g.nodes(data=True):
|
||||
if attrs["type"] != "function":
|
||||
continue
|
||||
|
||||
if "name" in attrs:
|
||||
continue
|
||||
|
||||
attrs["name"] = f"sub_{attrs['address']:x}"
|
||||
|
||||
# assign human-readable repr to add nodes
|
||||
# assign is_import=bool to functions
|
||||
# assign is_string=bool to datas
|
||||
for n, attrs in g.nodes(data=True):
|
||||
match attrs["type"]:
|
||||
case "function":
|
||||
attrs["repr"] = attrs["name"]
|
||||
attrs["is_import"] = "!" in attrs["name"]
|
||||
case "data":
|
||||
if string := read_string(address_space, attrs["address"]):
|
||||
attrs["repr"] = json.dumps(string)
|
||||
attrs["is_string"] = True
|
||||
else:
|
||||
attrs["repr"] = f"data_{attrs['address']:x}"
|
||||
attrs["is_string"] = False
|
||||
|
||||
for line in nx.generate_gexf(g):
|
||||
print(line)
|
||||
|
||||
# db.conn.close()
|
||||
return 0
|
||||
|
||||
|
||||
def _worker(args):
|
||||
|
||||
assemblage_database: Path
|
||||
assemblage_directory: Path
|
||||
graph_file: Path
|
||||
binary_id: int
|
||||
|
||||
(assemblage_database, assemblage_directory, graph_file, binary_id) = args
|
||||
if graph_file.is_file():
|
||||
return
|
||||
|
||||
logger.info("processing: %d", binary_id)
|
||||
process = subprocess.run(
|
||||
["python", __file__, "--debug", "generate", assemblage_database, assemblage_directory, str(binary_id)],
|
||||
capture_output=True,
|
||||
encoding="utf-8",
|
||||
)
|
||||
if process.returncode != 0:
|
||||
logger.warning("failed: %d", binary_id)
|
||||
logger.debug("%s", process.stderr)
|
||||
return
|
||||
|
||||
graph_file.parent.mkdir(exist_ok=True)
|
||||
graph = process.stdout
|
||||
graph_file.write_text(graph)
|
||||
|
||||
|
||||
def generate_all_main(args: argparse.Namespace) -> int:
    """Generate graphs for every binary in the database, using a worker pool.

    Each binary's graph is written to <output_directory>/<binary_id>/graph.gexf
    by `_worker`.

    Raises:
        ValueError: if the database file does not exist.
    """
    if not args.assemblage_database.is_file():
        raise ValueError("database doesn't exist")

    db = Assemblage(args.assemblage_database, args.assemblage_directory)
    binary_ids = list(db.get_binary_ids())

    # one work item per binary; _worker unpacks the tuple itself.
    work_items = (
        (
            args.assemblage_database,
            args.assemblage_directory,
            args.output_directory / str(binary_id) / "graph.gexf",
            binary_id,
        )
        for binary_id in binary_ids
    )

    with Pool(args.num_workers) as pool:
        # drain the iterator; results are ignored, only side effects matter.
        for _ in pool.imap_unordered(_worker, work_items):
            pass

    return 0
|
||||
|
||||
|
||||
def cluster_main(args: argparse.Namespace) -> int:
    """Cluster an existing GEXF graph with Louvain communities and print them.

    Args:
        args: parsed CLI namespace with `graph` (path to a .gexf file).

    Returns:
        0 on success.

    Raises:
        ValueError: if the graph file does not exist.
    """
    if not args.graph.is_file():
        raise ValueError("graph file doesn't exist")

    g = nx.read_gexf(args.graph)

    communities = nx.algorithms.community.louvain_communities(g)
    for i, community in enumerate(communities):
        print(f"[{i}]:")
        for node in community:
            # BUGFIX: check for the attribute we actually print. Functions
            # renamed to "sub_..." have a "name" but no "file" attribute,
            # so guarding on "name" raised KeyError on the next line.
            if "file" in g.nodes[node]:
                print(f" - {hex(int(node, 0))}: {g.nodes[node]['file']}")
            else:
                print(f" - {hex(int(node, 0))}")

    return 0
|
||||
|
||||
|
||||
# uv pip install torch --index-url https://download.pytorch.org/whl/cpu
|
||||
# uv pip install torch-geometric pandas numpy scikit-learn
|
||||
# import torch # do this on-demand below, because its slow
|
||||
# from torch_geometric.data import HeteroData
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeType:
|
||||
type: str
|
||||
attributes: dict[str, Literal[False] | Literal[""] | Literal[0] | float]
|
||||
|
||||
|
||||
@dataclass
class EdgeType:
    """Schema for one edge type in the heterogeneous graph.

    key:              edge kind, e.g. "call", "reference", "neighbor".
    source_type:      NodeType at the edge's source.
    destination_type: NodeType at the edge's destination.
    attributes:       maps attribute name to its default value.
    """

    key: str
    source_type: NodeType
    destination_type: NodeType
    attributes: dict[str, Literal[False] | Literal[""] | Literal[0] | float]
|
||||
|
||||
|
||||
# The graph schema: every node type and edge type the loader understands,
# keyed for O(1) lookup. Attribute dicts map attribute name -> default value.
NODE_TYPES = {
    node.type: node
    for node in [
        NodeType(
            type="function",
            attributes={
                "is_import": False,
                "does_reference_string": False,
                # unused here: repr (str), address (int), name (str), file (str)
            },
        ),
        NodeType(
            type="data",
            attributes={
                "is_string": False,
                # unused here: repr (str), address (int)
            },
        ),
    ]
}

FUNCTION_NODE = NODE_TYPES["function"]
DATA_NODE = NODE_TYPES["data"]

# Edge types keyed by (source type, key, destination type).
EDGE_TYPES = {
    (edge.source_type.type, edge.key, edge.destination_type.type): edge
    for edge in [
        EdgeType(
            key="call",
            source_type=FUNCTION_NODE,
            destination_type=FUNCTION_NODE,
            attributes={},
        ),
        EdgeType(
            key="reference",
            source_type=FUNCTION_NODE,
            destination_type=DATA_NODE,
            attributes={},
        ),
        EdgeType(
            # When functions reference other functions as data,
            # such as passing a function pointer as a callback.
            #
            # Example:
            #   __scrt_set_unhandled_exception_filter > reference > __scrt_unhandled_exception_filter
            key="reference",
            source_type=FUNCTION_NODE,
            destination_type=FUNCTION_NODE,
            attributes={},
        ),
        EdgeType(
            key="neighbor",
            source_type=FUNCTION_NODE,
            destination_type=FUNCTION_NODE,
            attributes={
                # this is the attribute to predict (ultimately):
                # is_source_file_boundary
                "distance": 1,
            },
        ),
        EdgeType(
            key="neighbor",
            source_type=DATA_NODE,
            destination_type=DATA_NODE,
            attributes={
                # this is the attribute to predict (ultimately):
                # is_source_file_boundary
                "distance": 1,
            },
        ),
    ]
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoadedGraph:
|
||||
data: "HeteroData"
|
||||
|
||||
# map from node type to:
|
||||
# map from node id (str) to node index (int), and node index (int) to node id (str).
|
||||
mapping: dict[str, dict[str | int, int | str]]
|
||||
|
||||
|
||||
def load_graph(g: nx.MultiDiGraph) -> LoadedGraph:
    """Convert a networkx graph into a torch-geometric HeteroData instance.

    Our networkx graph identifies nodes by str ("sha256:address"); torch
    identifies nodes by index, 0..N per node type. This builds the HeteroData
    plus a bidirectional mapping between the two id spaces.
    """
    import torch
    from torch_geometric.data import HeteroData

    # per node type: node id (str) -> type-local index (int)
    node_indexes_by_node: dict[str, dict[str, int]] = {n: {} for n in NODE_TYPES.keys()}
    # Because the key types differ (str vs int), a single dict can carry both
    # directions: a str key looks up an index, an int key looks up a node id.
    node_mapping: dict[str, dict[str | int, int | str]] = {n: {} for n in NODE_TYPES.keys()}
    for node_type in NODE_TYPES.keys():
        # stable ordering: sort (node, attrs) pairs, keep matching nodes
        ordered = [
            node
            for node, attrs in sorted(g.nodes(data=True))
            if attrs["type"] == node_type
        ]
        for i, node in enumerate(ordered):
            node_indexes_by_node[node_type][node] = i
            node_mapping[node_type][node] = i
            node_mapping[node_type][i] = node

    data = HeteroData()

    for node_type in NODE_TYPES.values():
        logger.debug("loading nodes: %s", node_type.type)

        node_indexes: list[int] = []
        attr_values: dict[str, list] = {attribute: [] for attribute in node_type.attributes.keys()}

        for node, attrs in g.nodes(data=True):
            if attrs["type"] != node_type.type:
                continue

            node_indexes.append(node_indexes_by_node[node_type.type][node])

            for attribute, default_value in node_type.attributes.items():
                attr_values[attribute].append(attrs.get(attribute, default_value))

        data[node_type.type].node_id = torch.tensor(node_indexes)
        if attr_values:
            # attribute order is implicit in the NODE_TYPES data model above.
            data[node_type.type].x = torch.stack(
                [torch.tensor(values) for values in attr_values.values()], dim=-1
            ).float()

    for edge_type in EDGE_TYPES.values():
        logger.debug(
            "loading edges: %s > %s > %s",
            edge_type.source_type.type, edge_type.key, edge_type.destination_type.type
        )

        source_indexes: list[int] = []
        destination_indexes: list[int] = []
        attr_values: dict[str, list] = {attribute: [] for attribute in edge_type.attributes.keys()}

        for source, destination, key, attrs in g.edges(data=True, keys=True):
            if key != edge_type.key:
                continue
            if g.nodes[source]["type"] != edge_type.source_type.type:
                continue
            if g.nodes[destination]["type"] != edge_type.destination_type.type:
                continue

            # Torch needs the node type-local index, not a global one:
            # functions index 0..N among functions, datas 0..N among datas.
            source_indexes.append(node_indexes_by_node[g.nodes[source]["type"]][source])
            destination_indexes.append(node_indexes_by_node[g.nodes[destination]["type"]][destination])

            for attribute, default_value in edge_type.attributes.items():
                attr_values[attribute].append(attrs.get(attribute, default_value))

        edge_key = (edge_type.source_type.type, edge_type.key, edge_type.destination_type.type)
        data[edge_key].edge_index = torch.stack(
            [
                torch.tensor(source_indexes),
                torch.tensor(destination_indexes),
            ]
        )
        if attr_values:
            # attribute order is implicit in the EDGE_TYPES data model above.
            data[edge_key].edge_attr = torch.stack(
                [torch.tensor(values) for values in attr_values.values()], dim=-1
            ).float()

    return LoadedGraph(
        data,
        node_mapping,
    )
|
||||
|
||||
|
||||
def train_main(args: argparse.Namespace) -> int:
|
||||
if not args.graph.is_file():
|
||||
raise ValueError("graph file doesn't exist")
|
||||
|
||||
logger.debug("loading torch")
|
||||
import torch
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
seed = 42
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
|
||||
logger.debug("reading graph from disk")
|
||||
g = nx.read_gexf(args.graph)
|
||||
|
||||
# Initial model: learn to find functions that reference a string.
|
||||
#
|
||||
# Once this works, then we can try a more complex model (edge features),
|
||||
# and ultimately an edge classifier.
|
||||
#
|
||||
# Ground truth from existing patterns like:
|
||||
#
|
||||
# function > references > data (:is_string=True)
|
||||
|
||||
for a, b, key, attrs in g.edges(data=True, keys=True):
|
||||
match (g.nodes[a]["type"], key, g.nodes[b]["type"]):
|
||||
case ("function", "reference", "data"):
|
||||
|
||||
if g.nodes[b].get("is_string"):
|
||||
g.nodes[a]["does_reference_string"] = True
|
||||
logger.debug("%s > reference > %s (string)", g.nodes[a]["repr"], g.nodes[b]["repr"])
|
||||
|
||||
case ("function", "reference", "function"):
|
||||
# The data model supports this.
|
||||
# Like passing a function pointer as a callback
|
||||
continue
|
||||
case ("data", "reference", "data"):
|
||||
# We don't support this.
|
||||
continue
|
||||
case ("data", "reference", "function"):
|
||||
# We don't support this.
|
||||
continue
|
||||
case (_, "call", _):
|
||||
continue
|
||||
case (_, "neighbor", _):
|
||||
continue
|
||||
case _:
|
||||
print(a, b, key, attrs, g.nodes[a], g.nodes[b])
|
||||
raise ValueError("unexpected structure")
|
||||
|
||||
# map existing attributes to the ground_truth attribute
|
||||
# for ease of updating the model/training.
|
||||
for node, attrs in g.nodes(data=True):
|
||||
if attrs["type"] != "function":
|
||||
continue
|
||||
|
||||
attrs["ground_truth"] = attrs.get("does_reference_string", False)
|
||||
|
||||
logger.debug("loading graph into torch")
|
||||
lg = load_graph(g)
|
||||
data = lg.data
|
||||
|
||||
data['data'].y = torch.zeros(data['data'].num_nodes, dtype=torch.long)
|
||||
data['function'].y = torch.zeros(data['function'].num_nodes, dtype=torch.long)
|
||||
true_indices = []
|
||||
|
||||
for node, attrs in g.nodes(data=True):
|
||||
if attrs.get("ground_truth"):
|
||||
print("true: ", g.nodes[node]["repr"])
|
||||
node_index = lg.mapping[attrs["type"]][node]
|
||||
print("index", attrs["type"], node_index)
|
||||
print(" ", node)
|
||||
print(" ", lg.mapping[attrs["type"]][node_index])
|
||||
|
||||
true_indices.append(node_index)
|
||||
# true_indices.append(data['function'].node_id[node_index].item())
|
||||
# print("true index: ", node_index, data['function'].node_id[node_index].item())
|
||||
|
||||
data['function'].y[true_indices] = 1
|
||||
print(data['function'].y)
|
||||
|
||||
# TODO
|
||||
import torch_geometric.transforms as T
|
||||
data = T.ToUndirected()(data)
|
||||
# data = T.AddSelfLoops()(data)
|
||||
data = T.NormalizeFeatures()(data)
|
||||
|
||||
print(data)
|
||||
|
||||
from torch_geometric.nn import RGCNConv, to_hetero, SAGEConv, Linear
|
||||
import torch.nn.functional as F
|
||||
|
||||
class GNN(torch.nn.Module):
|
||||
def __init__(self, hidden_channels, out_channels):
|
||||
super().__init__()
|
||||
self.conv1 = SAGEConv((-1, -1), hidden_channels)
|
||||
self.conv2 = SAGEConv((-1, -1), hidden_channels)
|
||||
self.lin = Linear(hidden_channels, out_channels)
|
||||
|
||||
def forward(self, x, edge_index):
|
||||
x = self.conv1(x, edge_index).relu()
|
||||
x = self.conv2(x, edge_index)
|
||||
x = self.lin(x)
|
||||
return x
|
||||
|
||||
model = GNN(hidden_channels=4, out_channels=2)
|
||||
# metadata: tuple[list of node types, list of edge types (source, key, dest)]
|
||||
model = to_hetero(model, data.metadata(), aggr='sum')
|
||||
# model.print_readable()
|
||||
|
||||
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
train_nodes, test_nodes = train_test_split(
|
||||
torch.arange(data['function'].num_nodes), test_size=0.2, random_state=42
|
||||
)
|
||||
|
||||
train_mask = torch.zeros(data['function'].num_nodes, dtype=torch.bool)
|
||||
# train_mask[train_nodes] = True
|
||||
train_mask[:] = True
|
||||
|
||||
test_mask = torch.zeros(data['function'].num_nodes, dtype=torch.bool)
|
||||
# test_mask[test_nodes] = True
|
||||
test_mask[:] = True
|
||||
|
||||
data['function'].train_mask = train_mask
|
||||
data['function'].test_mask = test_mask
|
||||
|
||||
logger.debug("training")
|
||||
for epoch in range(999):
|
||||
model.train()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# don't use edge attrs right now.
|
||||
out = model(data.x_dict, data.edge_index_dict) # data.edge_attr_dict)
|
||||
|
||||
out_function = out['function']
|
||||
y_function = data['function'].y
|
||||
|
||||
mask = data['function'].train_mask
|
||||
|
||||
# When classifying "function has string reference"
|
||||
# there is a major class imbalance, because 95% of function's don't reference a string,
|
||||
# so the model just learns to predict "no".
|
||||
# Therefore, weight the classes so that a "yes" prediction is much more valuable.
|
||||
class_counts = torch.bincount(data['function'].y[mask])
|
||||
class_weights = 1.0 / class_counts.float()
|
||||
class_weights = class_weights / class_weights.sum() * len(class_counts)
|
||||
|
||||
# CrossEntropyLoss(): the most common choice for node classification with mutually exclusive classes.
|
||||
# BCEWithLogitsLoss(): multi-label node classification
|
||||
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
|
||||
|
||||
loss = criterion(out_function[mask], y_function[mask])
|
||||
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
logger.info(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
|
||||
if loss <= 0.0001:
|
||||
logger.info("no more loss")
|
||||
break
|
||||
|
||||
logger.debug("evaluating")
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
out = model(data.x_dict, data.edge_index_dict) # TODO: edge attrs
|
||||
|
||||
mask = data['function'].test_mask
|
||||
pred = torch.argmax(out['function'][mask], dim=1)
|
||||
truth = data['function'].y[mask].int()
|
||||
|
||||
print("pred", pred[:32])
|
||||
print("truth", truth[:32])
|
||||
# print("index", data['function'].node_id[mask])
|
||||
# print("83: ", g.nodes[lg.mapping['function'][83]]['repr'])
|
||||
|
||||
accuracy = (pred == truth).float().mean()
|
||||
|
||||
# pred = (out[data['function'].test_mask] > 0).int().squeeze()
|
||||
# accuracy = (pred == data['function'].y[data['function'].test_mask]).float().mean()
|
||||
print(f'Accuracy: {accuracy:.4f}')
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def main(argv=None) -> int:
    """CLI entry point: dispatch to the generate/generate_all/cluster/train subcommands."""
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description="Identify object boundaries in compiled programs")
    capa.main.install_common_args(parser, wanted={})
    subparsers = parser.add_subparsers(title="subcommands", required=True)

    generate_parser = subparsers.add_parser("generate", help="generate graph for a sample")
    generate_parser.add_argument("assemblage_database", type=Path, help="path to Assemblage database")
    generate_parser.add_argument("assemblage_directory", type=Path, help="path to Assemblage samples directory")
    generate_parser.add_argument("binary_id", type=int, help="primary key of binary to inspect")
    generate_parser.set_defaults(func=generate_main)

    # leave a couple of cores free for the rest of the system
    num_cores = os.cpu_count() or 1
    default_workers = max(1, num_cores - 2)
    generate_all_parser = subparsers.add_parser("generate_all", help="generate graphs for all samples")
    generate_all_parser.add_argument("assemblage_database", type=Path, help="path to Assemblage database")
    generate_all_parser.add_argument("assemblage_directory", type=Path, help="path to Assemblage samples directory")
    generate_all_parser.add_argument("output_directory", type=Path, help="path to output directory")
    generate_all_parser.add_argument(
        "--num_workers", type=int, default=default_workers, help="number of workers to use"
    )
    generate_all_parser.set_defaults(func=generate_all_main)

    cluster_parser = subparsers.add_parser("cluster", help="cluster an existing graph")
    cluster_parser.add_argument("graph", type=Path, help="path to a graph file")
    cluster_parser.set_defaults(func=cluster_main)

    train_parser = subparsers.add_parser("train", help="train using an existing graph")
    train_parser.add_argument("graph", type=Path, help="path to a graph file")
    train_parser.set_defaults(func=train_main)

    args = parser.parse_args(args=argv)

    try:
        capa.main.handle_common_args(args)
    except capa.main.ShouldExitError as e:
        return e.status_code

    # goblin.pe is chatty at INFO; quiet it down
    logging.getLogger("goblin.pe").setLevel(logging.WARNING)

    return args.func(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # propagate the subcommand's return code as the process exit status
    sys.exit(main())
|
||||
@@ -125,7 +125,6 @@ def collect(args):
|
||||
key = str(file)
|
||||
|
||||
for backend in BACKENDS:
|
||||
|
||||
if (backend, file.name) in {
|
||||
("binja", "0953cc3b77ed2974b09e3a00708f88de931d681e2d0cb64afbaf714610beabe6.exe_")
|
||||
}:
|
||||
|
||||
@@ -75,15 +75,27 @@ def _render_expression_tree(
|
||||
tree_index: int,
|
||||
o: io.StringIO,
|
||||
):
|
||||
|
||||
expression_index = operand.expression_index[tree_index]
|
||||
expression = be2.expression[expression_index]
|
||||
children_tree_indexes: list[int] = expression_tree[tree_index]
|
||||
|
||||
if expression.type == BinExport2.Expression.REGISTER:
|
||||
o.write(expression.symbol)
|
||||
assert len(children_tree_indexes) == 0
|
||||
return
|
||||
assert len(children_tree_indexes) <= 1
|
||||
|
||||
if len(children_tree_indexes) == 0:
|
||||
return
|
||||
elif len(children_tree_indexes) == 1:
|
||||
# like for aarch64 with vector instructions, indicating vector data size:
|
||||
#
|
||||
# FADD V0.4S, V1.4S, V2.4S
|
||||
#
|
||||
# see: https://github.com/mandiant/capa/issues/2528
|
||||
child_index = children_tree_indexes[0]
|
||||
_render_expression_tree(be2, operand, expression_tree, child_index, o)
|
||||
return
|
||||
else:
|
||||
raise NotImplementedError(len(children_tree_indexes))
|
||||
|
||||
elif expression.type == BinExport2.Expression.SYMBOL:
|
||||
o.write(expression.symbol)
|
||||
@@ -107,8 +119,22 @@ def _render_expression_tree(
|
||||
|
||||
elif expression.type == BinExport2.Expression.IMMEDIATE_INT:
|
||||
o.write(f"0x{expression.immediate:X}")
|
||||
assert len(children_tree_indexes) == 0
|
||||
return
|
||||
assert len(children_tree_indexes) <= 1
|
||||
|
||||
if len(children_tree_indexes) == 0:
|
||||
return
|
||||
elif len(children_tree_indexes) == 1:
|
||||
# the ghidra exporter can produce some weird expressions,
|
||||
# particularly for MSRs, like for:
|
||||
#
|
||||
# sreg(3, 0, c.0, c.4, 4)
|
||||
#
|
||||
# see: https://github.com/mandiant/capa/issues/2530
|
||||
child_index = children_tree_indexes[0]
|
||||
_render_expression_tree(be2, operand, expression_tree, child_index, o)
|
||||
return
|
||||
else:
|
||||
raise NotImplementedError(len(children_tree_indexes))
|
||||
|
||||
elif expression.type == BinExport2.Expression.SIZE_PREFIX:
|
||||
# like: b4
|
||||
@@ -124,12 +150,16 @@ def _render_expression_tree(
|
||||
return
|
||||
|
||||
elif expression.type == BinExport2.Expression.OPERATOR:
|
||||
|
||||
if len(children_tree_indexes) == 1:
|
||||
# prefix operator, like "ds:"
|
||||
if expression.symbol != "!":
|
||||
o.write(expression.symbol)
|
||||
|
||||
if expression.symbol in ("lsl", "lsr"):
|
||||
# like: lsl 16
|
||||
# not like: lsl16
|
||||
o.write(" ")
|
||||
|
||||
child_index = children_tree_indexes[0]
|
||||
_render_expression_tree(be2, operand, expression_tree, child_index, o)
|
||||
|
||||
@@ -143,7 +173,13 @@ def _render_expression_tree(
|
||||
child_a = children_tree_indexes[0]
|
||||
child_b = children_tree_indexes[1]
|
||||
_render_expression_tree(be2, operand, expression_tree, child_a, o)
|
||||
|
||||
o.write(expression.symbol)
|
||||
if expression.symbol == ",":
|
||||
# like: 10, 20
|
||||
# not like 10,20
|
||||
o.write(" ")
|
||||
|
||||
_render_expression_tree(be2, operand, expression_tree, child_b, o)
|
||||
return
|
||||
|
||||
@@ -154,11 +190,19 @@ def _render_expression_tree(
|
||||
child_c = children_tree_indexes[2]
|
||||
_render_expression_tree(be2, operand, expression_tree, child_a, o)
|
||||
o.write(expression.symbol)
|
||||
if expression.symbol == ",":
|
||||
o.write(" ")
|
||||
_render_expression_tree(be2, operand, expression_tree, child_b, o)
|
||||
o.write(expression.symbol)
|
||||
if expression.symbol == ",":
|
||||
o.write(" ")
|
||||
_render_expression_tree(be2, operand, expression_tree, child_c, o)
|
||||
return
|
||||
|
||||
elif len(children_tree_indexes) == 0:
|
||||
# like when all subtrees have been pruned: don't render anything
|
||||
return
|
||||
|
||||
else:
|
||||
raise NotImplementedError(len(children_tree_indexes))
|
||||
|
||||
@@ -250,7 +294,6 @@ def inspect_instruction(be2: BinExport2, instruction: BinExport2.Instruction, ad
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
|
||||
if argv is None:
|
||||
argv = sys.argv[1:]
|
||||
|
||||
@@ -365,10 +408,17 @@ def main(argv=None):
|
||||
operands = []
|
||||
for operand_index in instruction.operand_index:
|
||||
operand = be2.operand[operand_index]
|
||||
# Ghidra bug where empty operands (no expressions) may
|
||||
# exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817)
|
||||
if len(operand.expression_index) > 0:
|
||||
operands.append(render_operand(be2, operand, index=operand_index))
|
||||
if not operand.expression_index:
|
||||
# Ghidra bug where empty operands (no expressions) may
|
||||
# exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817)
|
||||
continue
|
||||
|
||||
op = render_operand(be2, operand, index=operand_index)
|
||||
if not op:
|
||||
# operand has been pruned away, so don't show it
|
||||
continue
|
||||
|
||||
operands.append(op)
|
||||
|
||||
call_targets = ""
|
||||
if instruction.call_target:
|
||||
|
||||
Submodule tests/data updated: 40dce09a58...ea10c47b32
@@ -180,6 +180,12 @@ def get_binja_extractor(path: Path):
|
||||
if path.name.endswith("kernel32-64.dll_"):
|
||||
settings.set_bool("pdb.loadGlobalSymbols", old_pdb)
|
||||
|
||||
# TODO(xusheng6): Temporary fix for https://github.com/mandiant/capa/issues/2507. Remove this once it is fixed in
|
||||
# binja
|
||||
if "al-khaser_x64.exe_" in path.name:
|
||||
bv.create_user_function(0x14004B4F0)
|
||||
bv.update_analysis_and_wait()
|
||||
|
||||
extractor = capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv)
|
||||
|
||||
# overload the extractor so that the fixture exposes `extractor.path`
|
||||
@@ -332,6 +338,8 @@ def get_data_path_by_name(name) -> Path:
|
||||
return CD / "data" / "Practical Malware Analysis Lab 12-04.exe_"
|
||||
elif name == "pma16-01":
|
||||
return CD / "data" / "Practical Malware Analysis Lab 16-01.exe_"
|
||||
elif name == "pma16-01_binja_db":
|
||||
return CD / "data" / "Practical Malware Analysis Lab 16-01.exe_.bndb"
|
||||
elif name == "pma21-01":
|
||||
return CD / "data" / "Practical Malware Analysis Lab 21-01.exe_"
|
||||
elif name == "al-khaser x86":
|
||||
@@ -1387,6 +1395,43 @@ FEATURE_PRESENCE_TESTS_IDA = [
|
||||
("mimikatz", "file", capa.features.file.Import("cabinet.FCIAddFile"), True),
|
||||
]
|
||||
|
||||
FEATURE_BINJA_DATABASE_TESTS = sorted(
|
||||
[
|
||||
# insn/regex
|
||||
("pma16-01_binja_db", "function=0x4021B0", capa.features.common.Regex("HTTP/1.0"), True),
|
||||
(
|
||||
"pma16-01_binja_db",
|
||||
"function=0x402F40",
|
||||
capa.features.common.Regex("www.practicalmalwareanalysis.com"),
|
||||
True,
|
||||
),
|
||||
(
|
||||
"pma16-01_binja_db",
|
||||
"function=0x402F40",
|
||||
capa.features.common.Substring("practicalmalwareanalysis.com"),
|
||||
True,
|
||||
),
|
||||
("pma16-01_binja_db", "file", capa.features.file.FunctionName("__aulldiv"), True),
|
||||
# os & format & arch
|
||||
("pma16-01_binja_db", "file", OS(OS_WINDOWS), True),
|
||||
("pma16-01_binja_db", "file", OS(OS_LINUX), False),
|
||||
("pma16-01_binja_db", "function=0x404356", OS(OS_WINDOWS), True),
|
||||
("pma16-01_binja_db", "function=0x404356,bb=0x4043B9", OS(OS_WINDOWS), True),
|
||||
("pma16-01_binja_db", "file", Arch(ARCH_I386), True),
|
||||
("pma16-01_binja_db", "file", Arch(ARCH_AMD64), False),
|
||||
("pma16-01_binja_db", "function=0x404356", Arch(ARCH_I386), True),
|
||||
("pma16-01_binja_db", "function=0x404356,bb=0x4043B9", Arch(ARCH_I386), True),
|
||||
("pma16-01_binja_db", "file", Format(FORMAT_PE), True),
|
||||
("pma16-01_binja_db", "file", Format(FORMAT_ELF), False),
|
||||
# format is also a global feature
|
||||
("pma16-01_binja_db", "function=0x404356", Format(FORMAT_PE), True),
|
||||
],
|
||||
# order tests by (file, item)
|
||||
# so that our LRU cache is most effective.
|
||||
key=lambda t: (t[0], t[1]),
|
||||
)
|
||||
|
||||
|
||||
FEATURE_COUNT_TESTS = [
|
||||
("mimikatz", "function=0x40E5C2", capa.features.basicblock.BasicBlock(), 7),
|
||||
("mimikatz", "function=0x4702FD", capa.features.common.Characteristic("calls from"), 0),
|
||||
|
||||
@@ -36,7 +36,7 @@ except ImportError:
|
||||
@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed")
|
||||
@fixtures.parametrize(
|
||||
"sample,scope,feature,expected",
|
||||
fixtures.FEATURE_PRESENCE_TESTS + fixtures.FEATURE_SYMTAB_FUNC_TESTS,
|
||||
fixtures.FEATURE_PRESENCE_TESTS + fixtures.FEATURE_SYMTAB_FUNC_TESTS + fixtures.FEATURE_BINJA_DATABASE_TESTS,
|
||||
indirect=["sample", "scope"],
|
||||
)
|
||||
def test_binja_features(sample, scope, feature, expected):
|
||||
|
||||
6
web/explorer/package-lock.json
generated
6
web/explorer/package-lock.json
generated
@@ -2676,9 +2676,9 @@
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/nanoid": {
|
||||
"version": "3.3.7",
|
||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.7.tgz",
|
||||
"integrity": "sha512-eSRppjcPIatRIMC1U6UngP8XFcz8MQWGQdt1MTBQ7NaAmvXDfvNxbvWV3x2y6CdEUciCSsDHDQZbhYaB8QEo2g==",
|
||||
"version": "3.3.8",
|
||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz",
|
||||
"integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
|
||||
4
web/explorer/releases/CHANGELOG.md
Normal file
4
web/explorer/releases/CHANGELOG.md
Normal file
@@ -0,0 +1,4 @@
|
||||
## capa-explorer-web-v1.0.0-6a2330c
|
||||
- Release Date: 2024-11-27 13:03:17 UTC
|
||||
- SHA256: 3a7cf6927b0e8595f08b685669b215ef779eade622efd5e8d33efefadd849025
|
||||
|
||||
BIN
web/explorer/releases/capa-explorer-web-v1.0.0-6a2330c.zip
Normal file
BIN
web/explorer/releases/capa-explorer-web-v1.0.0-6a2330c.zip
Normal file
Binary file not shown.
@@ -215,6 +215,20 @@
|
||||
|
||||
<h2 class="mt-3">Tool Updates</h2>
|
||||
|
||||
<h3 class="mt-2">v8.0.0 (<em>2024-12-09</em>)</h3>
|
||||
<p class="mt-0">
|
||||
This point release fixes an issue with the IDAPython API to now handle IDA Pro 8.3, 8.4, and 9.0 correctly.
|
||||
</p>
|
||||
|
||||
<h3 class="mt-2">v8.0.0 (<em>2024-12-09</em>)</h3>
|
||||
<p class="mt-0">
|
||||
capa <a href="https://github.com/mandiant/capa/releases/tag/v8.0.0">v8.0.0</a> adds support for IDA Pro 9.0 (and idalib). The release comes with various improvements and bug fixes for the Binary Ninja backend (including to load with database files) -- thanks to @xusheng6.
|
||||
<br />
|
||||
Additional bug fixes improve the dynamic and BinExport backends.
|
||||
<br />
|
||||
capa version 8 now requires Python 3.10 or newer.
|
||||
</p>
|
||||
|
||||
<h3 class="mt-2">v7.4.0 (<em>2024-10-04</em>)</h3>
|
||||
<p class="mt-0">
|
||||
The <a href="https://github.com/mandiant/capa/releases/tag/v7.4.0">v7.4.0</a> capa release fixes a bug when processing VMRay analysis archives and enhances API extraction for all dynamic backends. For better terminal rendering capa now solely relies on the rich library.<br />
|
||||
|
||||
Reference in New Issue
Block a user