diff --git a/scripts/clean_for_ai b/scripts/clean_for_ai new file mode 100644 index 000000000..412b39e85 --- /dev/null +++ b/scripts/clean_for_ai @@ -0,0 +1,102 @@ +import os +import re +import tempfile + +def clean_and_merge_md_files(start_folder, exclude_keywords, output_file): + def clean_file_content(file_path): + """Clean the content of a single file and return the cleaned lines.""" + with open(file_path, "r", encoding="utf-8") as f: + content = f.readlines() + + cleaned_lines = [] + inside_hint = False + for line in content: + # Skip lines containing excluded keywords + if any(keyword in line for keyword in exclude_keywords): + continue + + # Detect and skip {% hint %} ... {% endhint %} blocks + if "{% hint style=\"success\" %}" in line: + inside_hint = True + if "{% endhint %}" in line: + inside_hint = False + continue + if inside_hint: + continue + + # Skip lines with
...
+ if re.match(r"
.*?
", line): + continue + + # Add the line if it passed all checks + cleaned_lines.append(line.rstrip()) + + # Remove excess consecutive empty lines + cleaned_lines = remove_consecutive_empty_lines(cleaned_lines) + return cleaned_lines + + def remove_consecutive_empty_lines(lines): + """Allow no more than one consecutive empty line.""" + cleaned_lines = [] + previous_line_empty = False + for line in lines: + if line.strip() == "": + if not previous_line_empty: + cleaned_lines.append("") + previous_line_empty = True + else: + cleaned_lines.append(line) + previous_line_empty = False + return cleaned_lines + + def gather_files_in_order(start_folder): + """Gather all .md files in a depth-first order.""" + files = [] + for root, _, filenames in os.walk(start_folder): + md_files = sorted([os.path.join(root, f) for f in filenames if f.endswith(".md")]) + files.extend(md_files) + return files + + # Gather files in depth-first order + all_files = gather_files_in_order(start_folder) + + # Process files and merge into a single output + with open(output_file, "w", encoding="utf-8") as output: + for file_path in all_files: + # Clean the content of the file + cleaned_content = clean_file_content(file_path) + + # Skip saving if the cleaned file has fewer than 10 non-empty lines + if len([line for line in cleaned_content if line.strip()]) < 10: + continue + + # Get the name of the file for the header + file_name = os.path.basename(file_path) + + # Write header, cleaned content, and 2 extra new lines + output.write(f"# {file_name}\n\n") + output.write("\n".join(cleaned_content)) + output.write("\n\n") + +def main(): + # Specify the starting folder and output file + start_folder = os.getcwd() + output_file = os.path.join(tempfile.gettempdir(), "merged_output.md") + + # Keywords to exclude from lines + exclude_keywords = [ + "Keyword1", # Replace with your keywords + "Keyword2", + "HackTricks", # Example + ] + + # Clean and merge .md files + clean_and_merge_md_files(start_folder, exclude_keywords, output_file) + + # Print the path to the output file + print(f"Merged content has been saved to: {output_file}") + +if __name__ == "__main__": + # Execute this from the hacktricks folder to clean + # It will clean all the .md files and compile them into 1 in a proper order + main()