Add SEO post-processing for cloud wiki

2026-03-12 21:22:57 -07:00 · 2026-03-11 21:25:52 +01:00
parent 40b954c021
commit 6a9b95fe96
5 changed files with 282 additions and 4 deletions
--- a/.github/workflows/build_master.yml
+++ b/.github/workflows/build_master.yml
@@ -34,6 +34,15 @@ jobs:
      # Build the mdBook
      - name: Build mdBook
        run: MDBOOK_BOOK__LANGUAGE=en mdbook build || (echo "Error logs" && cat hacktricks-preprocessor-error.log && echo "" && echo "" && echo "Debug logs" && (cat hacktricks-preprocessor.log | tail -n 20) && exit 1)
      - name: Post-process SEO artifacts
        run: |
          python3 scripts/seo_postprocess.py pages \
            --book-dir ./book \
            --site-url https://cloud.hacktricks.wiki \
            --lang en \
            --default-lang en \
            --site-name "HackTricks Cloud"
      - name: Push search index to hacktricks-searchindex repo
        shell: bash
@@ -149,6 +158,15 @@ jobs:
      - name: Sync to S3
        run: aws s3 sync ./book s3://hacktricks-cloud/en --delete
      - name: Upload root sitemap index
        run: |
          LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
          if [ -z "$LANGS" ]; then
            LANGS="en"
          fi
          python3 scripts/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
          aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300
      - name: Upload root ads.txt
        run: |
          aws s3 cp ./ads.txt s3://hacktricks-cloud/ads.txt --content-type text/plain --cache-control max-age=300
--- a/.github/workflows/translate_all.yml
+++ b/.github/workflows/translate_all.yml
@@ -254,6 +254,15 @@ jobs:
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-east-1
      - name: Post-process SEO artifacts
        run: |
          python3 scripts/seo_postprocess.py pages \
            --book-dir ./book \
            --site-url https://cloud.hacktricks.wiki \
            --lang "$BRANCH" \
            --default-lang en \
            --site-name "HackTricks Cloud"
      # Sync the build to S3
      - name: Sync to S3
@@ -265,3 +274,12 @@ jobs:
          echo "Sync completed"
          echo "Cat 3 files from the book"
          find . -type f -name 'index.html' -print | head -n 3 | xargs -r cat
      - name: Refresh root sitemap index
        run: |
          LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
          if [ -z "$LANGS" ]; then
            LANGS="en"
          fi
          python3 scripts/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
          aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300
--- a/scripts/seo_postprocess.py
+++ b/scripts/seo_postprocess.py
@@ -0,0 +1,242 @@
 import argparse
 import html
 import re
 from datetime import datetime, timezone
 from pathlib import Path
 import xml.etree.ElementTree as ET
 DEFAULT_LANGUAGES = [
    "af",
    "zh",
    "es",
    "en",
    "fr",
    "de",
    "el",
    "hi",
    "it",
    "ja",
    "ko",
    "pl",
    "pt",
    "sr",
    "sw",
    "tr",
    "uk",
 ]
 SKIP_HTML = {"404.html", "print.html", "toc.html"}
 SEO_START = "<!-- HT_SEO_START -->"
 SEO_END = "<!-- HT_SEO_END -->"
 def parse_args():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command", required=True)
    pages = subparsers.add_parser("pages")
    pages.add_argument("--book-dir", required=True)
    pages.add_argument("--site-url", required=True)
    pages.add_argument("--lang", required=True)
    pages.add_argument("--default-lang", default="en")
    pages.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
    pages.add_argument("--site-name", default="HackTricks Cloud")
    index_cmd = subparsers.add_parser("index")
    index_cmd.add_argument("--site-url", required=True)
    index_cmd.add_argument("--languages", required=True)
    index_cmd.add_argument("--output", required=True)
    return parser.parse_args()
 def parse_languages(raw):
    langs = []
    for item in raw.split(","):
        code = item.strip()
        if re.fullmatch(r"[a-z]{2}", code):
            langs.append(code)
    return sorted(set(langs))
 def iter_html_files(book_dir):
    for html_file in sorted(Path(book_dir).rglob("*.html")):
        if html_file.name in SKIP_HTML:
            continue
        yield html_file
 def canonical_url(site_url, lang, rel_path):
    return f"{site_url.rstrip('/')}/{lang}/{rel_path.as_posix()}"
 def clean_text(fragment):
    fragment = re.sub(r"<script\b[^>]*>.*?</script>", " ", fragment, flags=re.I | re.S)
    fragment = re.sub(r"<style\b[^>]*>.*?</style>", " ", fragment, flags=re.I | re.S)
    fragment = re.sub(r"<[^>]+>", " ", fragment)
    fragment = html.unescape(fragment)
    fragment = re.sub(r"\s+", " ", fragment).strip()
    return fragment
 def trim_description(text, fallback):
    text = text or fallback
    text = re.sub(r"\s+", " ", text).strip()
    if len(text) <= 160:
        return text
    cut = text[:157]
    if " " in cut:
        cut = cut.rsplit(" ", 1)[0]
    return cut + "..."
 def extract_description(document, fallback):
    main_match = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S)
    scope = main_match.group(1) if main_match else document
    for pattern in (r"<p\b[^>]*>(.*?)</p>", r"<li\b[^>]*>(.*?)</li>", r"<h[12]\b[^>]*>(.*?)</h[12]>"):
        for match in re.finditer(pattern, scope, flags=re.I | re.S):
            text = clean_text(match.group(1))
            if len(text) >= 40:
                return trim_description(text, fallback)
    return trim_description(clean_text(scope), fallback)
 def build_seo_block(site_url, lang, rel_path, languages, default_lang):
    current_url = canonical_url(site_url, lang, rel_path)
    lines = [SEO_START, f'<link rel="canonical" href="{html.escape(current_url, quote=True)}">']
    for alt_lang in languages:
        alt_url = canonical_url(site_url, alt_lang, rel_path)
        lines.append(
            f'<link rel="alternate" hreflang="{alt_lang}" href="{html.escape(alt_url, quote=True)}">'
        )
    default_url = canonical_url(site_url, default_lang, rel_path)
    lines.append(f'<link rel="alternate" hreflang="x-default" href="{html.escape(default_url, quote=True)}">')
    lines.append(SEO_END)
    return "\n        ".join(lines)
 def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name):
    title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S)
    page_title = clean_text(title_match.group(1)) if title_match else site_name
    fallback_description = f"{site_name}: {page_title}"
    description = extract_description(document, fallback_description)
    seo_block = build_seo_block(site_url, lang, rel_path, languages, default_lang)
    document = re.sub(
        r"\s*<!-- HT_SEO_START -->.*?<!-- HT_SEO_END -->\s*",
        "\n",
        document,
        flags=re.S,
    )
    if re.search(r'<meta\s+name="description"\s+content="[^"]*"\s*/?>', document, flags=re.I):
        document = re.sub(
            r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)',
            r"\1" + html.escape(description, quote=True) + r"\2",
            document,
            count=1,
            flags=re.I,
        )
    elif title_match:
        document = document.replace(
            title_match.group(0),
            title_match.group(0) + f'\n        <meta name="description" content="{html.escape(description, quote=True)}">',
            1,
        )
    document = re.sub(r"</head>", f"        {seo_block}\n    </head>", document, count=1, flags=re.I)
    return document
 def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang):
    ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9")
    ET.register_namespace("xhtml", "http://www.w3.org/1999/xhtml")
    urlset = ET.Element("{http://www.sitemaps.org/schemas/sitemap/0.9}urlset")
    for html_file in iter_html_files(book_dir):
        rel_path = html_file.relative_to(book_dir)
        url = ET.SubElement(urlset, "{http://www.sitemaps.org/schemas/sitemap/0.9}url")
        ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = canonical_url(
            site_url, lang, rel_path
        )
        lastmod = datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat()
        ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = lastmod
        for alt_lang in languages:
            ET.SubElement(
                url,
                "{http://www.w3.org/1999/xhtml}link",
                {
                    "rel": "alternate",
                    "hreflang": alt_lang,
                    "href": canonical_url(site_url, alt_lang, rel_path),
                },
            )
        ET.SubElement(
            url,
            "{http://www.w3.org/1999/xhtml}link",
            {
                "rel": "alternate",
                "hreflang": "x-default",
                "href": canonical_url(site_url, default_lang, rel_path),
            },
        )
    tree = ET.ElementTree(urlset)
    output = Path(book_dir) / "sitemap.xml"
    tree.write(output, encoding="utf-8", xml_declaration=True)
 def process_pages(args):
    book_dir = Path(args.book_dir)
    languages = parse_languages(args.languages)
    for html_file in iter_html_files(book_dir):
        rel_path = html_file.relative_to(book_dir)
        content = html_file.read_text(encoding="utf-8")
        updated = update_document(
            content,
            args.site_url,
            args.lang,
            rel_path,
            languages,
            args.default_lang,
            args.site_name,
        )
        html_file.write_text(updated, encoding="utf-8")
    generate_language_sitemap(book_dir, args.site_url, args.lang, languages, args.default_lang)
 def generate_sitemap_index(args):
    ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9")
    sitemapindex = ET.Element("{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex")
    now = datetime.now(timezone.utc).date().isoformat()
    for lang in parse_languages(args.languages):
        sitemap = ET.SubElement(sitemapindex, "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap")
        ET.SubElement(sitemap, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = (
            f"{args.site_url.rstrip('/')}/{lang}/sitemap.xml"
        )
        ET.SubElement(sitemap, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = now
    ET.ElementTree(sitemapindex).write(args.output, encoding="utf-8", xml_declaration=True)
 def main():
    args = parse_args()
    if args.command == "pages":
        process_pages(args)
    elif args.command == "index":
        generate_sitemap_index(args)
 if __name__ == "__main__":
    main()
--- a/src/robots.txt
+++ b/src/robots.txt
@@ -1,4 +1,4 @@
-Sitemap: https://www.hacktricks.wiki/sitemap.xml
+Sitemap: https://cloud.hacktricks.wiki/sitemap.xml
 User-agent: *
-Disallow:
+Disallow:
--- a/theme/index.hbs
+++ b/theme/index.hbs
@@ -163,7 +163,7 @@
                                <a class="menu-bar-link" href="https://hacktricks-training.com" target="_blank">
                                    HT Training
                                </a>
-                                <a class="menu-bar-link" href="https://book.hacktricks.wiki/" target="_blank">
+                                <a class="menu-bar-link" href="https://hacktricks.wiki/" target="_blank">
                                    HT Book
                                </a>
                                <a class="menu-bar-link" href="https://tools.hacktricks.wiki/" target="_blank">
@@ -184,7 +184,7 @@
                                    <span class="menu-hamburger" aria-hidden="true">≡</span>
                                    <div id="menubar-collapse-popup" class="menubar-collapse-popup" aria-label="Menu" role="menu">
                                        <a href="https://hacktricks-training.com" target="_blank" role="menuitem" class="menu-bar-link">HT Training</a>
-                                        <a href="https://book.hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">Book HT</a>
+                                        <a href="https://hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">Book HT</a>
                                        <a href="https://tools.hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">HT Tools</a>
                                        <a href="https://github.com/sponsors/carlospolop" target="_blank" role="menuitem" class="menu-bar-link">Sponsor</a>
                                        <a href="https://www.linkedin.com/company/hacktricks" target="_blank" role="menuitem" class="menu-bar-link">Linkedin</a>