Files
hacktricks-cloud/scripts/clean_for_ai.py
Carlos Polop 299fe36f36 a
2024-12-19 00:48:15 +01:00

146 lines
5.5 KiB
Python

import os
import re
import tempfile
def clean_and_merge_md_files(start_folder, exclude_keywords, output_file):
def clean_file_content(file_path):
"""Clean the content of a single file and return the cleaned lines."""
with open(file_path, "r", encoding="utf-8") as f:
content = f.readlines()
cleaned_lines = []
inside_hint = False
for i,line in enumerate(content):
# Skip lines containing excluded keywords
if any(keyword in line for keyword in exclude_keywords):
continue
# Detect and skip {% hint %} ... {% endhint %} blocks
if "{% hint style=\"success\" %}" in line and "Learn & practice" in content[i+1]:
inside_hint = True
if "{% endhint %}" in line:
inside_hint = False
continue
if inside_hint:
continue
# Skip lines with <figure> ... </figure>
if re.match(r"<figure>.*?</figure>", line):
continue
# Add the line if it passed all checks
cleaned_lines.append(line.rstrip())
# Remove excess consecutive empty lines
cleaned_lines = remove_consecutive_empty_lines(cleaned_lines)
return cleaned_lines
def remove_consecutive_empty_lines(lines):
"""Allow no more than one consecutive empty line."""
cleaned_lines = []
previous_line_empty = False
for line in lines:
if line.strip() == "":
if not previous_line_empty:
cleaned_lines.append("")
previous_line_empty = True
else:
cleaned_lines.append(line)
previous_line_empty = False
return cleaned_lines
def gather_files_in_order(start_folder):
"""Gather all .md files in a depth-first order."""
files = []
for root, _, filenames in os.walk(start_folder):
md_files = sorted([os.path.join(root, f) for f in filenames if f.endswith(".md")])
files.extend(md_files)
return files
# Gather files in depth-first order
all_files = gather_files_in_order(start_folder)
# Process files and merge into a single output
with open(output_file, "w", encoding="utf-8") as output:
for file_path in all_files:
# Clean the content of the file
cleaned_content = clean_file_content(file_path)
# Skip saving if the cleaned file has fewer than 10 non-empty lines
if len([line for line in cleaned_content if line.strip()]) < 10:
continue
# Get the name of the file for the header
file_name = os.path.basename(file_path)
# Write header, cleaned content, and 2 extra new lines
output.write(f"# {file_name}\n\n")
output.write("\n".join(cleaned_content))
output.write("\n\n")
def main():
# Specify the starting folder and output file
start_folder = os.getcwd()
output_file = os.path.join(tempfile.gettempdir(), "merged_output.md")
# Keywords to exclude from lines
exclude_keywords = [
"STM Cyber", # STM Cyber ads
"offer several valuable cybersecurity services", # STM Cyber ads
"and hack the unhackable", # STM Cyber ads
"blog.stmcyber.com", # STM Cyber ads
"RootedCON", # RootedCON ads
"rootedcon.com", # RootedCON ads
"the mission of promoting technical knowledge", # RootedCON ads
"Intigriti", # Intigriti ads
"intigriti.com", # Intigriti ads
"Trickest", # Trickest ads
"trickest.com", # Trickest ads,
"Get Access Today:",
"HACKENPROOF", # Hackenproof ads
"hackenproof.com", # Hackenproof ads
"HackenProof", # Hackenproof ads
"discord.com/invite/N3FrSbmwdy", # Hackenproof ads
"Hacking Insights:", # Hackenproof ads
"Engage with content that delves", # Hackenproof ads
"Real-Time Hack News:", # Hackenproof ads
"Keep up-to-date with fast-paced", # Hackenproof ads
"Latest Announcements:", # Hackenproof ads
"Stay informed with the newest bug", # Hackenproof ads
"start collaborating with top hackers today!", # Hackenproof ads
"discord.com/invite/N3FrSbmwdy", # Hackenproof ads
"Pentest-Tools", # Pentest-Tools.com ads
"pentest-tools.com", # Pentest-Tools.com ads
"perspective on your web apps, network, and", # Pentest-Tools.com ads
"report critical, exploitable vulnerabilities with real business impact", # Pentest-Tools.com ads
"SerpApi", # SerpApi ads
"serpapi.com", # SerpApi ads
"offers fast and easy real-time", # SerpApi ads
"plans includes access to over 50 different APIs for scraping", # SerpApi ads
"8kSec", # 8kSec ads
"academy.8ksec.io", # 8kSec ads
"Learn the technologies and skills required", # 8kSec ads
"WebSec", # WebSec ads
"websec.nl", # WebSec ads
"which means they do it all; Pentesting", # WebSec ads
]
# Clean and merge .md files
clean_and_merge_md_files(start_folder, exclude_keywords, output_file)
# Print the path to the output file
print(f"Merged content has been saved to: {output_file}")
if __name__ == "__main__":
# Execute this from the hacktricks folder to clean
# It will clean all the .md files and compile them into 1 in a proper order
main()