fix: auto-detect Weakpass total_pages instead of hardcoding 67

Replace the hardcoded `total_pages=67` default with `None` (auto-detect).
On first call the function probes page 1 to read `last_page` from the
Inertia `data-page` payload. If found, that count drives the thread pool;
if not found, it falls back to a sequential walk until an empty page
is returned; and if the probe itself fails, it degrades gracefully to 67.
Callers that pass `total_pages` explicitly are unaffected.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Justin Bollinger
2026-04-24 23:38:24 -04:00
parent 9743f9673f
commit 8a9037459b

View File

@@ -331,11 +331,92 @@ def register_torrent_cleanup():
_TORRENT_CLEANUP_REGISTERED = True
def fetch_all_weakpass_wordlists_multithreaded(total_pages=67, threads=10):
wordlists = []
def fetch_all_weakpass_wordlists_multithreaded(total_pages=None, threads=10):
"""Fetch all Weakpass wordlists. Auto-detects page count from the Inertia payload."""
headers = {"User-Agent": "Mozilla/5.0"}
def _fetch_page(page):
    """Fetch a single page; return (entries, last_page_or_None).

    Downloads the Weakpass listing page and parses the Inertia JSON payload
    embedded in the `#app` div's `data-page` attribute. `entries` is a list
    of normalized wordlist dicts; `last_page_or_None` is the site-reported
    total page count when the payload exposes it, else None.
    """
    url = f"https://weakpass.com/wordlists?page={page}"
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    app_div = soup.find("div", id="app")
    # No Inertia payload present -> treat as an empty page with no page count.
    if not app_div or not app_div.has_attr("data-page"):
        return [], None
    data_page_val = app_div["data-page"]
    # BeautifulSoup attribute values are not guaranteed to be str;
    # normalize before handing to json.loads.
    if not isinstance(data_page_val, str):
        data_page_val = str(data_page_val)
    data = json.loads(data_page_val)
    wordlists_raw = data.get("props", {}).get("wordlists", {})
    last_page = None
    if isinstance(wordlists_raw, dict):
        # Check multiple possible locations for last_page
        # (top-level key vs. a Laravel-style "meta" sub-dict).
        last_page = (
            wordlists_raw.get("last_page")
            or wordlists_raw.get("meta", {}).get("last_page")
        )
        # Paginated payloads nest the rows under "data".
        if "data" in wordlists_raw:
            wordlists_raw = wordlists_raw["data"]
    else:
        # Unexpected payload shape: yield no entries rather than crash.
        wordlists_raw = []
    # Normalize each raw row into the flat dict shape used downstream.
    # NOTE(review): source key is "downloaded" but output key is "downloads",
    # and "torrent_link" maps to "torrent_url" — intentional renames, keep.
    entries = [
        {
            "id": wl.get("id", ""),
            "name": wl.get("name", ""),
            "size": wl.get("size", ""),
            "rank": wl.get("rank", ""),
            "downloads": wl.get("downloaded", ""),
            "torrent_url": wl.get("torrent_link", ""),
        }
        for wl in wordlists_raw
    ]
    return entries, last_page
# Determine total_pages via probe if not provided
if total_pages is None:
try:
entries1, detected = _fetch_page(1)
if detected:
total_pages = int(detected)
print(f"[i] Weakpass: {total_pages} pages detected")
elif entries1:
# last_page not in payload; fall back to sequential until empty
all_wordlists = list(entries1)
page = 2
while True:
try:
entries, _ = _fetch_page(page)
except Exception as e:
print(f"Error fetching page {page}: {e}")
break
if not entries:
break
all_wordlists.extend(entries)
page += 1
# de-duplicate and return early
seen = set()
result = []
for wl in all_wordlists:
if wl["name"] not in seen:
result.append(wl)
seen.add(wl["name"])
return result
else:
print("[!] Weakpass page 1 returned no results; falling back to 67 pages")
total_pages = 67
entries1 = []
except Exception as e:
print(f"[!] Weakpass probe failed ({e}); falling back to 67 pages")
total_pages = 67
entries1 = []
else:
entries1 = []
# Thread-pool fetch for pages 1..total_pages
# (If we already have entries1 from the probe, we skip page 1 in the pool)
wordlists = list(entries1)
lock = threading.Lock()
q = Queue()
headers = {"User-Agent": "Mozilla/5.0"}
def worker():
while True:
@@ -343,37 +424,15 @@ def fetch_all_weakpass_wordlists_multithreaded(total_pages=67, threads=10):
if page is None:
break
try:
url = f"https://weakpass.com/wordlists?page={page}"
r = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(r.text, "html.parser")
app_div = soup.find("div", id="app")
if not app_div or not app_div.has_attr("data-page"):
q.task_done()
continue
data_page_val = app_div["data-page"]
if not isinstance(data_page_val, str):
data_page_val = str(data_page_val)
data = json.loads(data_page_val)
wordlists_data = data.get("props", {}).get("wordlists", {})
if isinstance(wordlists_data, dict) and "data" in wordlists_data:
wordlists_data = wordlists_data["data"]
entries, _ = _fetch_page(page)
with lock:
for wl in wordlists_data:
wordlists.append(
{
"id": wl.get("id", ""),
"name": wl.get("name", ""),
"size": wl.get("size", ""),
"rank": wl.get("rank", ""),
"downloads": wl.get("downloaded", ""),
"torrent_url": wl.get("torrent_link", ""),
}
)
wordlists.extend(entries)
except Exception as e:
print(f"Error fetching page {page}: {e}")
q.task_done()
for page in range(1, total_pages + 1):
start_page = 2 if entries1 else 1
for page in range(start_page, total_pages + 1):
q.put(page)
threads_list = []