diff --git a/.github/workflows/build_master.yml b/.github/workflows/build_master.yml
index 2693b4821..b0d92e7ea 100644
--- a/.github/workflows/build_master.yml
+++ b/.github/workflows/build_master.yml
@@ -34,6 +34,15 @@ jobs:
# Build the mdBook
- name: Build mdBook
run: MDBOOK_BOOK__LANGUAGE=en mdbook build || (echo "Error logs" && cat hacktricks-preprocessor-error.log && echo "" && echo "" && echo "Debug logs" && (cat hacktricks-preprocessor.log | tail -n 20) && exit 1)
+
+ - name: Post-process SEO artifacts
+ run: |
+ python3 scripts/seo_postprocess.py pages \
+ --book-dir ./book \
+ --site-url https://cloud.hacktricks.wiki \
+ --lang en \
+ --default-lang en \
+ --site-name "HackTricks Cloud"
- name: Push search index to hacktricks-searchindex repo
shell: bash
@@ -149,6 +158,15 @@ jobs:
- name: Sync to S3
run: aws s3 sync ./book s3://hacktricks-cloud/en --delete
+ - name: Upload root sitemap index
+ run: |
+ LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
+ if [ -z "$LANGS" ]; then
+ LANGS="en"
+ fi
+ python3 scripts/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
+ aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300
+
- name: Upload root ads.txt
run: |
aws s3 cp ./ads.txt s3://hacktricks-cloud/ads.txt --content-type text/plain --cache-control max-age=300
diff --git a/.github/workflows/translate_all.yml b/.github/workflows/translate_all.yml
index cc6c178ed..49bd5d363 100644
--- a/.github/workflows/translate_all.yml
+++ b/.github/workflows/translate_all.yml
@@ -254,6 +254,15 @@ jobs:
with:
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
aws-region: us-east-1
+
+ - name: Post-process SEO artifacts
+ run: |
+ python3 scripts/seo_postprocess.py pages \
+ --book-dir ./book \
+ --site-url https://cloud.hacktricks.wiki \
+ --lang "$BRANCH" \
+ --default-lang en \
+ --site-name "HackTricks Cloud"
# Sync the build to S3
- name: Sync to S3
@@ -265,3 +274,12 @@ jobs:
echo "Sync completed"
echo "Cat 3 files from the book"
find . -type f -name 'index.html' -print | head -n 3 | xargs -r cat
+
+ - name: Refresh root sitemap index
+ run: |
+ LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
+ if [ -z "$LANGS" ]; then
+ LANGS="en"
+ fi
+ python3 scripts/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
+ aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300
diff --git a/scripts/seo_postprocess.py b/scripts/seo_postprocess.py
new file mode 100644
index 000000000..5ec45b107
--- /dev/null
+++ b/scripts/seo_postprocess.py
@@ -0,0 +1,242 @@
+import argparse
+import html
+import re
+from datetime import datetime, timezone
+from pathlib import Path
+import xml.etree.ElementTree as ET
+
+
+DEFAULT_LANGUAGES = [
+ "af",
+ "zh",
+ "es",
+ "en",
+ "fr",
+ "de",
+ "el",
+ "hi",
+ "it",
+ "ja",
+ "ko",
+ "pl",
+ "pt",
+ "sr",
+ "sw",
+ "tr",
+ "uk",
+]
+
+SKIP_HTML = {"404.html", "print.html", "toc.html"}
+SEO_START = "<!-- SEO_START -->"
+SEO_END = "<!-- SEO_END -->"
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ subparsers = parser.add_subparsers(dest="command", required=True)
+
+ pages = subparsers.add_parser("pages")
+ pages.add_argument("--book-dir", required=True)
+ pages.add_argument("--site-url", required=True)
+ pages.add_argument("--lang", required=True)
+ pages.add_argument("--default-lang", default="en")
+ pages.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
+ pages.add_argument("--site-name", default="HackTricks Cloud")
+
+ index_cmd = subparsers.add_parser("index")
+ index_cmd.add_argument("--site-url", required=True)
+ index_cmd.add_argument("--languages", required=True)
+ index_cmd.add_argument("--output", required=True)
+
+ return parser.parse_args()
+
+
+def parse_languages(raw):
+ langs = []
+ for item in raw.split(","):
+ code = item.strip()
+ if re.fullmatch(r"[a-z]{2}", code):
+ langs.append(code)
+ return sorted(set(langs))
+
+
+def iter_html_files(book_dir):
+ for html_file in sorted(Path(book_dir).rglob("*.html")):
+ if html_file.name in SKIP_HTML:
+ continue
+ yield html_file
+
+
+def canonical_url(site_url, lang, rel_path):
+ return f"{site_url.rstrip('/')}/{lang}/{rel_path.as_posix()}"
+
+
+def clean_text(fragment):
+    fragment = re.sub(r"<script\b[^>]*>.*?</script>", " ", fragment, flags=re.I | re.S)
+    fragment = re.sub(r"<style\b[^>]*>.*?</style>", " ", fragment, flags=re.I | re.S)
+ fragment = re.sub(r"<[^>]+>", " ", fragment)
+ fragment = html.unescape(fragment)
+ fragment = re.sub(r"\s+", " ", fragment).strip()
+ return fragment
+
+
+def trim_description(text, fallback):
+ text = text or fallback
+ text = re.sub(r"\s+", " ", text).strip()
+ if len(text) <= 160:
+ return text
+ cut = text[:157]
+ if " " in cut:
+ cut = cut.rsplit(" ", 1)[0]
+ return cut + "..."
+
+
+def extract_description(document, fallback):
+ main_match = re.search(r"
]*>(.*?)
", r"