From 889c2aab05c88ade6adf2025134cb39a3c239878 Mon Sep 17 00:00:00 2001
From: Carlos Polop
Date: Mon, 13 Apr 2026 20:01:42 +0200
Subject: [PATCH] f

---
 .gitignore            |  1 +
 scripts/translator.py | 30 +++++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9e9af90d4..77eced4ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ book/*
 hacktricks-preprocessor.log
 hacktricks-preprocessor-error.log
 searchindex.js
+**.pyc
\ No newline at end of file
diff --git a/scripts/translator.py b/scripts/translator.py
index c9489539c..a854479ab 100644
--- a/scripts/translator.py
+++ b/scripts/translator.py
@@ -18,6 +18,16 @@ MAX_TOKENS = 50000 #gpt-4-1106-preview
 DISALLOWED_SPECIAL = "<|endoftext|>"
 REPLACEMENT_TOKEN = ""
 
+TOKENIZER_FALLBACKS = [
+    ("gpt-5", "o200k_base"),
+    ("gpt-4o", "o200k_base"),
+    ("gpt-4.1", "o200k_base"),
+    ("gpt-4", "cl100k_base"),
+    ("gpt-3.5", "cl100k_base"),
+]
+
+FINAL_TOKENIZER_FALLBACK = "o200k_base"
+
 def run_git_command_with_retry(cmd, max_retries=1, delay=5, **kwargs):
     """
     Run a git command with retry logic.
@@ -59,8 +69,24 @@ def _sanitize(text: str) -> str:
     """
     return text.replace(DISALLOWED_SPECIAL, REPLACEMENT_TOKEN)
 
+def _get_encoding_for_model(model: str):
+    """
+    Return a tokenizer for the requested model, with fallbacks for newer
+    model names that tiktoken may not recognize yet.
+    """
+    try:
+        return tiktoken.encoding_for_model(model)
+    except KeyError:
+        lowered_model = model.lower()
+        for prefix, encoding_name in TOKENIZER_FALLBACKS:
+            if lowered_model.startswith(prefix):
+                print(f"Tokenizer for model {model} not found. Falling back to {encoding_name}.")
+                return tiktoken.get_encoding(encoding_name)
+        print(f"Tokenizer for model {model} not found. Falling back to {FINAL_TOKENIZER_FALLBACK}.")
+        return tiktoken.get_encoding(FINAL_TOKENIZER_FALLBACK)
+
 def reportTokens(prompt, model):
-    encoding = tiktoken.encoding_for_model(model)
+    encoding = _get_encoding_for_model(model)
     # print number of tokens in light gray, with first 50 characters of prompt in green. if truncated, show that it is truncated
     #print("\033[37m" + str(len(encoding.encode(prompt))) + " tokens\033[0m" + " in prompt: " + "\033[92m" + prompt[:50] + "\033[0m" + ("..." if len(prompt) > 50 else ""))
     prompt = _sanitize(prompt)
@@ -316,6 +342,7 @@ def copy_dirs(source_path, dest_path, folder_names):
             print(f"Error: {source_folder} does not exist.")
         else:
             # Copy the theme folder
+            os.makedirs(os.path.dirname(destination_folder.rstrip(os.sep)) or dest_path, exist_ok=True)
             shutil.copytree(source_folder, destination_folder)
             print(f"Copied {folder_name} folder from {source_folder} to {destination_folder}")
 
@@ -326,6 +353,7 @@ def move_files_to_push(source_path, dest_path, relative_file_paths):
         if not os.path.exists(source_filepath):
             print(f"Error: {source_filepath} does not exist.")
         else:
+            os.makedirs(os.path.dirname(dest_filepath), exist_ok=True)
             shutil.copy2(source_filepath, dest_filepath)
             print(f"[+] Copied {file_path}")
 