From aacfcaaa239db18b5c68986163bf38f6823fb18b Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 3 Mar 2023 09:52:50 -0700
Subject: [PATCH 01/12] explorer: improve embedded PE detection

---
 capa/features/extractors/ida/file.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/capa/features/extractors/ida/file.py b/capa/features/extractors/ida/file.py
index 0d380a88..532d5a89 100644
--- a/capa/features/extractors/ida/file.py
+++ b/capa/features/extractors/ida/file.py
@@ -21,12 +21,14 @@ from capa.features.file import Export, Import, Section, FunctionName
 from capa.features.common import FORMAT_PE, FORMAT_ELF, Format, String, Feature, Characteristic
 from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, AbsoluteVirtualAddress
 
+MAX_OFFSET_PE_AFTER_MZ = 0x200
+
 
 def check_segment_for_pe(seg: idaapi.segment_t) -> Iterator[Tuple[int, int]]:
     """check segment for embedded PE
 
     adapted for IDA from:
-    https://github.com/vivisect/vivisect/blob/7be4037b1cecc4551b397f840405a1fc606f9b53/PE/carve.py#L19
+    https://github.com/vivisect/vivisect/blob/91e8419a861f49779f18316f155311967e696836/PE/carve.py#L25
     """
     seg_max = seg.end_ea
     mz_xor = [
@@ -40,13 +42,14 @@ def check_segment_for_pe(seg: idaapi.segment_t) -> Iterator[Tuple[int, int]]:
 
     todo = []
     for mzx, pex, i in mz_xor:
+        # find all segment offsets containing XOR'd "MZ" bytes
         for off in capa.features.extractors.ida.helpers.find_byte_sequence(seg.start_ea, seg.end_ea, mzx):
             todo.append((off, mzx, pex, i))
 
     while len(todo):
         off, mzx, pex, i = todo.pop()
 
-        # The MZ header has one field we will check e_lfanew is at 0x3c
+        # the MZ header has one field we check: e_lfanew, located at offset 0x3c
         e_lfanew = off + 0x3C
 
         if seg_max < (e_lfanew + 4):
@@ -54,6 +57,10 @@ def check_segment_for_pe(seg: idaapi.segment_t) -> Iterator[Tuple[int, int]]:
 
         newoff = struct.unpack("<I", capa.features.extractors.helpers.xor_static(idc.get_bytes(e_lfanew, 4), i))[0]
 
+        # assume XOR'd "PE" bytes lie within a threshold of "MZ"; skip offsets beyond it
+        if newoff > MAX_OFFSET_PE_AFTER_MZ:
+            continue
+
         peoff = off + newoff
         if seg_max < (peoff + 2):
             continue
@@ -61,9 +68,6 @@ def check_segment_for_pe(seg: idaapi.segment_t) -> Iterator[Tuple[int, int]]:
         if idc.get_bytes(peoff, 2) == pex:
             yield off, i
 
-        for nextres in capa.features.extractors.ida.helpers.find_byte_sequence(off + 1, seg.end_ea, mzx):
-            todo.append((nextres, mzx, pex, i))
-
 
 def extract_file_embedded_pe() -> Iterator[Tuple[Feature, Address]]:
     """extract embedded PE features

From 14c18727db75e6b04dfb8b588f331173ed31ef63 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 3 Mar 2023 09:55:45 -0700
Subject: [PATCH 02/12] update CHANGELOG

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 543db9ac..52f6de8e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,7 @@
 - match: extend OS characteristic to match OS_ANY to all supported OSes #1324 @mike-hunhoff
 - explorer: fix exception when plugin loaded in IDA hosted under idat #1341 @mike-hunhoff
 - extractor: fix IDA and vivisect string and bytes features overlap and tests #1327 #1336 @xusheng6 
+- explorer: improve embedded PE detection #1344 @mike-hunhoff
 
 ### capa explorer IDA Pro plugin
 

From 02dc42154bf97b71ae730e49149e95ed51bfdaf0 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 6 Mar 2023 08:53:57 -0700
Subject: [PATCH 03/12] Update CHANGELOG.md

Co-authored-by: Willi Ballenthin <willi.ballenthin@gmail.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52f6de8e..439b0827 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,7 +27,7 @@
 - match: extend OS characteristic to match OS_ANY to all supported OSes #1324 @mike-hunhoff
 - explorer: fix exception when plugin loaded in IDA hosted under idat #1341 @mike-hunhoff
 - extractor: fix IDA and vivisect string and bytes features overlap and tests #1327 #1336 @xusheng6 
-- explorer: improve embedded PE detection #1344 @mike-hunhoff
+- explorer: improve embedded PE detection performance and reduce FP potential #1344 @mike-hunhoff
 
 ### capa explorer IDA Pro plugin
 

From 95f23dafe57930ffef8f8df1aed4dacce4389bc8 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 6 Mar 2023 08:55:32 -0700
Subject: [PATCH 04/12] Update CHANGELOG.md

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 439b0827..ac3ba707 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,11 +25,11 @@
 ### Bug Fixes
 - extractor: fix vivisect loop detection corner case #1310 @mr-tz
 - match: extend OS characteristic to match OS_ANY to all supported OSes #1324 @mike-hunhoff
-- explorer: fix exception when plugin loaded in IDA hosted under idat #1341 @mike-hunhoff
 - extractor: fix IDA and vivisect string and bytes features overlap and tests #1327 #1336 @xusheng6 
-- explorer: improve embedded PE detection performance and reduce FP potential #1344 @mike-hunhoff
 
 ### capa explorer IDA Pro plugin
+- fix exception when plugin loaded in IDA hosted under idat #1341 @mike-hunhoff
+- improve embedded PE detection performance and reduce FP potential #1344 @mike-hunhoff
 
 ### Development
 

From 51286380717dd60be9e2ae1d89b8f8ebd8ccf12c Mon Sep 17 00:00:00 2001
From: manasghandat <95558940+manasghandat@users.noreply.github.com>
Date: Thu, 9 Mar 2023 11:58:47 +0530
Subject: [PATCH 05/12] code style: update lint.py (#1352)

* code style: update lint.py
---
 scripts/lint.py | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/scripts/lint.py b/scripts/lint.py
index c049dd6e..cf56f1a8 100644
--- a/scripts/lint.py
+++ b/scripts/lint.py
@@ -648,11 +648,11 @@ class FormatStringQuotesIncorrect(Lint):
                     continue
                 if value.style is None:
                     # no quotes
-                    self.recommendation = 'add double quotes to "%s"' % value.value
+                    self.recommendation = f'add double quotes to "{value.value}"'
                     return True
                 if value.style == "'":
                     # single quote
-                    self.recommendation = 'change single quotes to double quotes for "%s"' % value.value
+                    self.recommendation = f'change single quotes to double quotes for "{value.value}"'
                     return True
 
             elif isinstance(key, ruamel.yaml.ScalarEvent) and key.value == "substring":
@@ -662,11 +662,11 @@ class FormatStringQuotesIncorrect(Lint):
                     continue
                 if value.style is None:
                     # no quotes
-                    self.recommendation = 'add double quotes to "%s"' % value.value
+                    self.recommendation = f'add double quotes to "{value.value}"'
                     return True
                 if value.style == "'":
                     # single quote
-                    self.recommendation = 'change single quotes to double quotes for "%s"' % value.value
+                    self.recommendation = f'change single quotes to double quotes for "{value.value}"'
                     return True
 
             else:
@@ -816,25 +816,12 @@ def lint_rule(ctx: Context, rule: Rule):
         # and ends up just producing a lot of noise.
         if not (is_nursery_rule(rule) and len(violations) == 1 and violations[0].name == "missing examples"):
             print("")
-            print(
-                "%s%s"
-                % (
-                    "    (nursery) " if is_nursery_rule(rule) else "",
-                    rule.name,
-                )
-            )
+            print(f'{"    (nursery) " if is_nursery_rule(rule) else ""} {rule.name}')
 
             for violation in violations:
                 print(
-                    "%s  %s: %s: %s"
-                    % (
-                        "    " if is_nursery_rule(rule) else "",
-                        Lint.WARN if is_nursery_rule(rule) else violation.level,
-                        violation.name,
-                        violation.recommendation,
-                    )
+                    f"{'    ' if is_nursery_rule(rule) else ''}  {Lint.WARN if is_nursery_rule(rule) else violation.level}: {violation.name}: {violation.recommendation}"
                 )
-
             print("")
 
     if is_nursery_rule(rule):
@@ -860,8 +847,8 @@ def lint_rule(ctx: Context, rule: Rule):
 
         if (not lints_failed) and (not lints_warned) and has_examples:
             print("")
-            print("%s%s" % ("    (nursery) ", rule.name))
-            print("%s  %s: %s: %s" % ("    ", Lint.WARN, green("no lint failures"), "Graduate the rule"))
+            print(f'{"    (nursery) " if is_nursery_rule(rule) else ""} {rule.name}')
+            print(f"      {Lint.WARN}: {green('no lint failures')}: Graduate the rule")
             print("")
     else:
         lints_failed = len(tuple(filter(lambda v: v.level == Lint.FAIL, violations)))
@@ -921,7 +908,7 @@ def lint(ctx: Context):
         with redirecting_print_to_tqdm():
             for rule in pbar:
                 name = rule.name
-                pbar.set_description(width("linting rule: %s" % (name), 48))
+                pbar.set_description(width(f"linting rule: {name}", 48))
                 ret[name] = lint_rule(ctx, rule)
 
     return ret

From d8f89d49d47ec7a0af7db83a2d0f29588e100c70 Mon Sep 17 00:00:00 2001
From: AG <98327736+ggold7046@users.noreply.github.com>
Date: Fri, 10 Mar 2023 01:17:59 +0530
Subject: [PATCH 06/12] Update import-to-bn.py

Use f-strings for enhanced readability.
---
 scripts/import-to-bn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/import-to-bn.py b/scripts/import-to-bn.py
index b9b48cd8..d157af44 100644
--- a/scripts/import-to-bn.py
+++ b/scripts/import-to-bn.py
@@ -93,9 +93,9 @@ def load_analysis(bv):
     rows = sorted(rows)
     for ns, name, va in rows:
         if ns:
-            cmt = "%s (%s)" % (name, ns)
+            cmt = f"{name} ({ns})"
         else:
-            cmt = "%s" % (name,)
+            cmt = f"{name}"
 
         binaryninja.log_info("0x%x: %s" % (va, cmt))
         try:

From 50935372cab82f0184d9142ffe243360c56fff29 Mon Sep 17 00:00:00 2001
From: AG <98327736+ggold7046@users.noreply.github.com>
Date: Fri, 10 Mar 2023 01:36:17 +0530
Subject: [PATCH 07/12] Update import-to-ida.py

Use f-strings for enhanced readability.
---
 scripts/import-to-ida.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/import-to-ida.py b/scripts/import-to-ida.py
index e5d4b5ad..a32b45b9 100644
--- a/scripts/import-to-ida.py
+++ b/scripts/import-to-ida.py
@@ -101,9 +101,9 @@ def main():
     rows = sorted(rows)
     for ns, name, va in rows:
         if ns:
-            cmt = "%s (%s)" % (name, ns)
+            cmt = f"{name} ({ns})"
         else:
-            cmt = "%s" % (name,)
+            cmt = f"{name}"
 
         logger.info("0x%x: %s", va, cmt)
         try:

From eaeef59583f38532e91b2b0cb71ce5bb0b6184d8 Mon Sep 17 00:00:00 2001
From: AG <98327736+ggold7046@users.noreply.github.com>
Date: Fri, 10 Mar 2023 13:03:04 +0530
Subject: [PATCH 08/12] Update insn.py

Use an f-string for enhanced readability.
---
 capa/features/insn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/insn.py b/capa/features/insn.py
index 1f1c0171..030784fe 100644
--- a/capa/features/insn.py
+++ b/capa/features/insn.py
@@ -31,7 +31,7 @@ class _AccessFeature(Feature, abc.ABC):
         super().__init__(value, description=description)
         if access is not None:
             if access not in VALID_FEATURE_ACCESS:
-                raise ValueError("%s access type %s not valid" % (self.name, access))
+                raise ValueError(f"{self.name} access type {access} not valid")
         self.access = access
 
     def __hash__(self):

From df6de3446c14b1a2231741daf76f08124b867aa2 Mon Sep 17 00:00:00 2001
From: AG <98327736+ggold7046@users.noreply.github.com>
Date: Fri, 10 Mar 2023 13:10:02 +0530
Subject: [PATCH 09/12] Update file.py

Use f-strings for enhanced readability.
---
 capa/features/extractors/ida/file.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/ida/file.py b/capa/features/extractors/ida/file.py
index 532d5a89..a3da4c6a 100644
--- a/capa/features/extractors/ida/file.py
+++ b/capa/features/extractors/ida/file.py
@@ -106,13 +106,13 @@ def extract_file_import_names() -> Iterator[Tuple[Feature, Address]]:
             for name in capa.features.extractors.helpers.generate_symbols(info[0], info[1]):
                 yield Import(name), addr
             dll = info[0]
-            symbol = "#%d" % (info[2])
+            symbol = f"#{info[2]}"
         elif info[1]:
             dll = info[0]
             symbol = info[1]
         elif info[2]:
             dll = info[0]
-            symbol = "#%d" % (info[2])
+            symbol = f"#{info[2]}"
         else:
             continue
 
@@ -180,7 +180,7 @@ def extract_file_format() -> Iterator[Tuple[Feature, Address]]:
         # no file type to return when processing a binary file, but we want to continue processing
         return
     else:
-        raise NotImplementedError("unexpected file format: %d" % file_info.filetype)
+        raise NotImplementedError(f"unexpected file format: {file_info.filetype}")
 
 
 def extract_features() -> Iterator[Tuple[Feature, Address]]:

From 7031c68a85187881f307954cbca8d5e20d9e31c9 Mon Sep 17 00:00:00 2001
From: linpeiyu164 <selina.peiyu@gmail.com>
Date: Sat, 11 Mar 2023 00:07:24 +0800
Subject: [PATCH 10/12] fix wrong indentation level for args.backend

---
 capa/main.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/capa/main.py b/capa/main.py
index ba03c7a4..617df71f 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -853,15 +853,15 @@ def install_common_args(parser, wanted=None):
             help="select sample format, %s" % format_help,
         )
 
-        if "backend" in wanted:
-            parser.add_argument(
-                "-b",
-                "--backend",
-                type=str,
-                help="select the backend to use",
-                choices=(BACKEND_VIV,),
-                default=BACKEND_VIV,
-            )
+    if "backend" in wanted:
+        parser.add_argument(
+            "-b",
+            "--backend",
+            type=str,
+            help="select the backend to use",
+            choices=(BACKEND_VIV,),
+            default=BACKEND_VIV,
+        )
 
     if "rules" in wanted:
         parser.add_argument(

From 02e451a2b19e564b03a235af92bc32d3792a5c55 Mon Sep 17 00:00:00 2001
From: AG <98327736+ggold7046@users.noreply.github.com>
Date: Sat, 11 Mar 2023 12:29:59 +0530
Subject: [PATCH 11/12] Update profile-memory.py

Use f-strings for enhanced readability.
---
 scripts/profile-memory.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/profile-memory.py b/scripts/profile-memory.py
index 349f6ee9..c6e2df90 100644
--- a/scripts/profile-memory.py
+++ b/scripts/profile-memory.py
@@ -16,10 +16,10 @@ def display_top(snapshot, key_type="lineno", limit=10):
     )
     top_stats = snapshot.statistics(key_type)
 
-    print("Top %s lines" % limit)
+    print(f"Top {limit} lines")
     for index, stat in enumerate(top_stats[:limit], 1):
         frame = stat.traceback[0]
-        print("#%s: %s:%s: %.1f KiB" % (index, frame.filename, frame.lineno, stat.size / 1024))
+        print(f"#{index}: {frame.filename}:{frame.lineno}: {stat.size / 1024:.1f} KiB")
         line = linecache.getline(frame.filename, frame.lineno).strip()
         if line:
             print("    %s" % line)
@@ -27,9 +27,9 @@ def display_top(snapshot, key_type="lineno", limit=10):
     other = top_stats[limit:]
     if other:
         size = sum(stat.size for stat in other)
-        print("%s other: %.1f KiB" % (len(other), size / 1024))
+        print(f"{len(other)} other: {size / 1024:.1f} KiB")
     total = sum(stat.size for stat in top_stats)
-    print("Total allocated size: %.1f KiB" % (total / 1024))
+    print(f"Total allocated size: {total / 1024:.1f} KiB")
 
 
 def main():
@@ -45,11 +45,11 @@ def main():
     import capa.main
 
     count = int(os.environ.get("CAPA_PROFILE_COUNT", 1))
-    print("total iterations planned: %d (set via env var CAPA_PROFILE_COUNT)." % (count))
+    print(f"total iterations planned: {count} (set via env var CAPA_PROFILE_COUNT).")
     print()
 
     for i in range(count):
-        print("iteration %d/%d..." % (i + 1, count))
+        print(f"iteration {i + 1}/{count}...")
         with contextlib.redirect_stdout(io.StringIO()):
             with contextlib.redirect_stderr(io.StringIO()):
                 t0 = time.time()
@@ -59,9 +59,9 @@ def main():
                 gc.collect()
 
         process = psutil.Process(os.getpid())
-        print("  duration: %0.02fs" % (t1 - t0))
-        print("  rss: %.1f MiB" % (process.memory_info().rss / 1024 / 1024))
-        print("  vms: %.1f MiB" % (process.memory_info().vms / 1024 / 1024))
+        print(f"  duration: {t1 - t0:.02f}s")
+        print(f"  rss: {process.memory_info().rss / 1024 / 1024:.1f} MiB")
+        print(f"  vms: {process.memory_info().vms / 1024 / 1024:.1f} MiB")
 
     print("done.")
     gc.collect()

From 6321adc41196c402a61637577082919864f587a2 Mon Sep 17 00:00:00 2001
From: AG <98327736+ggold7046@users.noreply.github.com>
Date: Sat, 11 Mar 2023 12:43:22 +0530
Subject: [PATCH 12/12] Update match-function-id.py

Use an f-string for enhanced readability.
---
 scripts/match-function-id.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/match-function-id.py b/scripts/match-function-id.py
index e7b1ea38..0daa88bb 100644
--- a/scripts/match-function-id.py
+++ b/scripts/match-function-id.py
@@ -125,7 +125,7 @@ def main(argv=None):
         for analyzer in analyzers:
             name = viv_utils.flirt.match_function_flirt_signatures(analyzer.matcher, vw, function)
             if name:
-                print("0x%04x: %s" % (function, name))
+                print(f"0x{function:04x}: {name}")
 
     return 0