Compare commits

...

6 Commits

Author SHA1 Message Date
Carlos Polop
8161b74c38 Set social preview logo for cloud book 2026-03-12 00:52:01 +01:00
Carlos Polop
dc3df85e47 Add automatic social and structured SEO tags 2026-03-12 00:19:09 +01:00
Carlos Polop
48514dbd94 Fix cloud searchindex retry working directory 2026-03-11 21:58:12 +01:00
Carlos Polop
67584e84e5 Trigger deploys for workflow and SEO script changes 2026-03-11 21:35:32 +01:00
Carlos Polop
234bd83229 Invalidate SEO deploys and fix translation script path 2026-03-11 21:34:12 +01:00
Carlos Polop
6a9b95fe96 Add SEO post-processing for cloud wiki 2026-03-11 21:25:52 +01:00
5 changed files with 418 additions and 8 deletions

View File

@@ -5,9 +5,7 @@ on:
branches:
- master
paths-ignore:
- 'scripts/**'
- '.gitignore'
- '.github/**'
- 'book/**'
workflow_dispatch:
@@ -34,6 +32,15 @@ jobs:
# Build the mdBook
- name: Build mdBook
run: MDBOOK_BOOK__LANGUAGE=en mdbook build || (echo "Error logs" && cat hacktricks-preprocessor-error.log && echo "" && echo "" && echo "Debug logs" && (cat hacktricks-preprocessor.log | tail -n 20) && exit 1)
- name: Post-process SEO artifacts
run: |
python3 scripts/seo_postprocess.py pages \
--book-dir ./book \
--site-url https://cloud.hacktricks.wiki \
--lang en \
--default-lang en \
--site-name "HackTricks Cloud"
- name: Push search index to hacktricks-searchindex repo
shell: bash
@@ -93,6 +100,7 @@ jobs:
RETRY_COUNT=$((RETRY_COUNT + 1))
echo "Push attempt ${RETRY_COUNT}/${MAX_RETRIES}"
cd "${GITHUB_WORKSPACE}"
rm -rf /tmp/searchindex-repo /tmp/searchindex-backup
git clone https://x-access-token:${TOKEN}@github.com/${TARGET_REPO}.git /tmp/searchindex-repo
@@ -149,6 +157,15 @@ jobs:
- name: Sync to S3
run: aws s3 sync ./book s3://hacktricks-cloud/en --delete
- name: Upload root sitemap index
run: |
LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
if [ -z "$LANGS" ]; then
LANGS="en"
fi
python3 scripts/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300
- name: Upload root ads.txt
run: |
aws s3 cp ./ads.txt s3://hacktricks-cloud/ads.txt --content-type text/plain --cache-control max-age=300
@@ -158,4 +175,10 @@ jobs:
run: |
aws s3 cp ./src/robots.txt s3://hacktricks-cloud/robots.txt --content-type text/plain --cache-control max-age=300
aws s3 cp ./src/robots.txt s3://hacktricks-cloud/en/robots.txt --content-type text/plain --cache-control max-age=300
- name: Invalidate CloudFront HTML and SEO assets
run: |
aws cloudfront create-invalidation \
--distribution-id "${{ secrets.CLOUDFRONT_DISTRIBUTION_ID }}" \
--paths "/en/*" "/robots.txt" "/en/robots.txt" "/sitemap.xml" "/en/sitemap.xml"

View File

@@ -5,9 +5,7 @@ on:
branches:
- master
paths-ignore:
- 'scripts/**'
- '.gitignore'
- '.github/**'
- Dockerfile
workflow_dispatch:
@@ -82,6 +80,7 @@ jobs:
wget -O /tmp/get_and_save_refs.py https://raw.githubusercontent.com/HackTricks-wiki/hacktricks-cloud/master/scripts/get_and_save_refs.py
wget -O /tmp/compare_and_fix_refs.py https://raw.githubusercontent.com/HackTricks-wiki/hacktricks-cloud/master/scripts/compare_and_fix_refs.py
wget -O /tmp/translator.py https://raw.githubusercontent.com/HackTricks-wiki/hacktricks-cloud/master/scripts/translator.py
wget -O /tmp/seo_postprocess.py https://raw.githubusercontent.com/HackTricks-wiki/hacktricks-cloud/master/scripts/seo_postprocess.py
- name: Run get_and_save_refs.py
run: |
@@ -254,6 +253,15 @@ jobs:
with:
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
aws-region: us-east-1
- name: Post-process SEO artifacts
run: |
python3 /tmp/seo_postprocess.py pages \
--book-dir ./book \
--site-url https://cloud.hacktricks.wiki \
--lang "$BRANCH" \
--default-lang en \
--site-name "HackTricks Cloud"
# Sync the build to S3
- name: Sync to S3
@@ -265,3 +273,18 @@ jobs:
echo "Sync completed"
echo "Cat 3 files from the book"
find . -type f -name 'index.html' -print | head -n 3 | xargs -r cat
- name: Refresh root sitemap index
run: |
LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
if [ -z "$LANGS" ]; then
LANGS="en"
fi
python3 /tmp/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300
- name: Invalidate CloudFront HTML and SEO assets
run: |
aws cloudfront create-invalidation \
--distribution-id "${{ secrets.CLOUDFRONT_DISTRIBUTION_ID }}" \
--paths "/$BRANCH/*" "/robots.txt" "/sitemap.xml" "/$BRANCH/sitemap.xml"

364
scripts/seo_postprocess.py Normal file
View File

@@ -0,0 +1,364 @@
import argparse
import html
import json
import re
from datetime import datetime, timezone
from pathlib import Path
import xml.etree.ElementTree as ET
# Language editions mirrored under /<lang>/ URL prefixes on the site; used as
# the hreflang set when --languages is not supplied on the command line.
DEFAULT_LANGUAGES = [
    "af",
    "zh",
    "es",
    "en",
    "fr",
    "de",
    "el",
    "hi",
    "it",
    "ja",
    "ko",
    "pl",
    "pt",
    "sr",
    "sw",
    "tr",
    "uk",
]
# mdBook build artifacts that should not receive SEO tags or sitemap entries.
SKIP_HTML = {"404.html", "print.html", "toc.html"}
# Sentinel comments delimiting the injected <head> block so a rerun can strip
# and replace it idempotently (see update_document).
SEO_START = "<!-- HT_SEO_START -->"
SEO_END = "<!-- HT_SEO_END -->"
def parse_args():
    """Parse CLI arguments for the two sub-commands.

    ``pages`` post-processes a rendered book in place; ``index`` writes a
    root sitemap index file. Returns the parsed ``argparse.Namespace``.
    """
    root = argparse.ArgumentParser()
    commands = root.add_subparsers(dest="command", required=True)

    pages_cmd = commands.add_parser("pages")
    for required_flag in ("--book-dir", "--site-url", "--lang"):
        pages_cmd.add_argument(required_flag, required=True)
    pages_cmd.add_argument("--default-lang", default="en")
    pages_cmd.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
    pages_cmd.add_argument("--site-name", default="HackTricks Cloud")

    index_cmd = commands.add_parser("index")
    for required_flag in ("--site-url", "--languages", "--output"):
        index_cmd.add_argument(required_flag, required=True)

    return root.parse_args()
def parse_languages(raw):
    """Split a comma-separated list into a sorted, de-duplicated list of codes.

    Only two-letter lowercase codes survive; anything else ("EN", "pt-br",
    empty chunks) is silently dropped.
    """
    candidates = {chunk.strip() for chunk in raw.split(",")}
    return sorted(code for code in candidates if re.fullmatch(r"[a-z]{2}", code))
def iter_html_files(book_dir):
    """Yield every rendered ``*.html`` page under ``book_dir``, sorted.

    Pages listed in SKIP_HTML (404/print/toc artifacts) are excluded.
    """
    for candidate in sorted(Path(book_dir).rglob("*.html")):
        if candidate.name not in SKIP_HTML:
            yield candidate
def canonical_url(site_url, lang, rel_path):
    """Absolute URL of ``rel_path`` (a Path) under the given language prefix."""
    base = site_url.rstrip("/")
    return "/".join([base, lang, rel_path.as_posix()])
def asset_url(site_url, lang, asset_path):
    """Absolute URL for a static asset (string path) under the language prefix.

    NOTE(review): not called anywhere in this module — presumably kept for
    external callers; confirm before removing.
    """
    return "/".join([site_url.rstrip("/"), lang, asset_path.lstrip("/")])
def social_image_url(site_url):
    """Shared social-preview image URL (English logo) used for every page.

    NOTE(review): several social crawlers do not render SVG og:images —
    consider serving a PNG fallback; confirm before changing the asset.
    """
    return site_url.rstrip("/") + "/en/images/CLOUD-logo-letters.svg"
def clean_text(fragment):
    """Reduce an HTML fragment to plain text.

    Drops <script>/<style> elements (including contents), strips all
    remaining tags, unescapes entities and collapses whitespace.
    """
    for container in (r"<script\b[^>]*>.*?</script>", r"<style\b[^>]*>.*?</style>"):
        fragment = re.sub(container, " ", fragment, flags=re.I | re.S)
    tagless = re.sub(r"<[^>]+>", " ", fragment)
    return re.sub(r"\s+", " ", html.unescape(tagless)).strip()
def trim_description(text, fallback):
    """Normalize whitespace and clamp to at most 160 characters.

    Falls back to ``fallback`` when ``text`` is empty; long text is cut at
    157 chars, broken at the last word boundary, and suffixed with "...".
    """
    normalized = re.sub(r"\s+", " ", text or fallback).strip()
    if len(normalized) <= 160:
        return normalized
    truncated = normalized[:157]
    if " " in truncated:
        truncated = truncated.rsplit(" ", 1)[0]
    return truncated + "..."
def extract_description(document, fallback):
    """Pick a meta description from the page body.

    Scans <main> (or the whole document when absent) for the first
    paragraph, list item, or h1/h2 whose cleaned text is at least 40
    characters; otherwise falls back to the whole cleaned scope.
    """
    main_match = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S)
    scope = main_match.group(1) if main_match else document
    candidate_patterns = (
        r"<p\b[^>]*>(.*?)</p>",
        r"<li\b[^>]*>(.*?)</li>",
        r"<h[12]\b[^>]*>(.*?)</h[12]>",
    )
    for pattern in candidate_patterns:
        for found in re.finditer(pattern, scope, flags=re.I | re.S):
            text = clean_text(found.group(1))
            if len(text) >= 40:
                return trim_description(text, fallback)
    return trim_description(clean_text(scope), fallback)
def strip_index_suffix(path):
    """Drop a trailing "index.html" (and its slash) from a Path, as a string."""
    posix = path.as_posix()
    if posix == "index.html":
        return ""
    if posix.endswith("/index.html"):
        return posix[: -len("/index.html")]
    return posix
def is_homepage(rel_path):
    """True only for the site root page (top-level index.html)."""
    return rel_path.name == "index.html" and rel_path.parent == Path(".")
def humanize_slug(value):
value = value.replace(".html", "").replace("-", " ").replace("_", " ").strip()
value = re.sub(r"\s+", " ", value)
return value.title() if value else "Home"
def breadcrumb_items(site_url, lang, rel_path):
    """Build the breadcrumb trail for a page as [{"name", "url"}, ...].

    The trail starts at Home and adds one crumb per path segment. Directory
    segments link to their "index.html"; a leaf page like "a/b.html" links
    to itself.

    Bug fix: the original appended "index.html" to every crumb, so the final
    crumb for a leaf page pointed at the non-existent "a/b.html/index.html"
    instead of the page's own canonical URL.
    """
    items = [{"name": "Home", "url": canonical_url(site_url, lang, Path("index.html"))}]
    bare_path = strip_index_suffix(rel_path)
    if not bare_path:
        return items
    parts = [part for part in bare_path.split("/") if part]
    last = len(parts) - 1
    for idx, part in enumerate(parts):
        if idx == last and part.endswith(".html"):
            # Leaf page: the crumb is the page itself.
            crumb_rel = Path(*parts[: idx + 1])
        else:
            crumb_rel = Path(*parts[: idx + 1], "index.html")
        items.append({"name": humanize_slug(part), "url": canonical_url(site_url, lang, crumb_rel)})
    return items
def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url):
    """Build the JSON-LD graph for one page.

    Emits four schema.org nodes — Organization, WebSite, WebPage and
    BreadcrumbList — cross-referenced via "@id" anchors, returned as a list
    ready for json.dumps.
    """
    current_url = canonical_url(site_url, lang, rel_path)
    site_root = site_url.rstrip("/")
    # Canonical English homepage; used as the SearchAction target below.
    website_url = canonical_url(site_url, "en", Path("index.html"))
    data = [
        {
            "@context": "https://schema.org",
            "@type": "Organization",
            "@id": f"{site_root}/#organization",
            "name": site_name,
            "url": site_root,
            "logo": {"@type": "ImageObject", "url": image_url},
        },
        {
            "@context": "https://schema.org",
            "@type": "WebSite",
            "@id": f"{site_root}/#website",
            "url": site_root,
            "name": site_name,
            # NOTE(review): the site-wide node is pinned to "en" even for
            # translated builds — confirm this is intended.
            "inLanguage": "en",
            "publisher": {"@id": f"{site_root}/#organization"},
        },
        {
            "@context": "https://schema.org",
            "@type": "WebPage",
            "@id": f"{current_url}#webpage",
            "url": current_url,
            "name": title,
            "description": description,
            "inLanguage": lang,
            "isPartOf": {"@id": f"{site_root}/#website"},
            "about": {"@id": f"{site_root}/#organization"},
            "primaryImageOfPage": {"@type": "ImageObject", "url": image_url},
        },
        {
            "@context": "https://schema.org",
            "@type": "BreadcrumbList",
            "itemListElement": [
                {
                    "@type": "ListItem",
                    "position": index,
                    "name": item["name"],
                    "item": item["url"],
                }
                for index, item in enumerate(breadcrumb_items(site_url, lang, rel_path), start=1)
            ],
        },
    ]
    if is_homepage(rel_path):
        # Only the homepage advertises a sitelinks search box to crawlers.
        data[1]["potentialAction"] = {
            "@type": "SearchAction",
            "target": f"{website_url}?search={{search_term_string}}",
            "query-input": "required name=search_term_string",
        }
    return data
def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, description, site_name):
    """Render the HT_SEO_START..HT_SEO_END <head> block for one page.

    Contains the canonical link, one hreflang alternate per language plus
    x-default, Open Graph and Twitter card meta tags, and the JSON-LD
    payload from build_structured_data. Returned as a single string joined
    with newline + indent for insertion before </head>.
    """
    current_url = canonical_url(site_url, lang, rel_path)
    image_url = social_image_url(site_url)
    # Compact separators keep the inline <script> payload small.
    structured_data = json.dumps(
        build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url),
        ensure_ascii=False,
        separators=(",", ":"),
    )
    lines = [SEO_START, f'<link rel="canonical" href="{html.escape(current_url, quote=True)}">']
    for alt_lang in languages:
        alt_url = canonical_url(site_url, alt_lang, rel_path)
        lines.append(
            f'<link rel="alternate" hreflang="{alt_lang}" href="{html.escape(alt_url, quote=True)}">'
        )
    default_url = canonical_url(site_url, default_lang, rel_path)
    lines.append(f'<link rel="alternate" hreflang="x-default" href="{html.escape(default_url, quote=True)}">')
    lines.extend(
        [
            f'<meta property="og:site_name" content="{html.escape(site_name, quote=True)}">',
            f'<meta property="og:title" content="{html.escape(title, quote=True)}">',
            f'<meta property="og:description" content="{html.escape(description, quote=True)}">',
            f'<meta property="og:url" content="{html.escape(current_url, quote=True)}">',
            f'<meta property="og:type" content="website">',
            f'<meta property="og:image" content="{html.escape(image_url, quote=True)}">',
            f'<meta property="og:image:secure_url" content="{html.escape(image_url, quote=True)}">',
            f'<meta property="og:image:type" content="image/svg+xml">',
            f'<meta property="og:image:alt" content="{html.escape(site_name, quote=True)}">',
            f'<meta property="og:locale" content="{html.escape(lang, quote=True)}">',
            f'<meta name="twitter:card" content="summary_large_image">',
            f'<meta name="twitter:title" content="{html.escape(title, quote=True)}">',
            f'<meta name="twitter:description" content="{html.escape(description, quote=True)}">',
            f'<meta name="twitter:image" content="{html.escape(image_url, quote=True)}">',
            '<script type="application/ld+json">' + structured_data + "</script>",
        ]
    )
    lines.append(SEO_END)
    return "\n ".join(lines)
def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name):
    """Idempotently inject SEO metadata into one rendered HTML document.

    Steps: derive title and description, build the SEO block, strip any
    block left by a previous run, refresh (or insert) the
    <meta name="description"> tag, then append the new block before </head>.
    Returns the modified document text.
    """
    title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S)
    page_title = clean_text(title_match.group(1)) if title_match else site_name
    fallback_description = f"{site_name}: {page_title}"
    description = extract_description(document, fallback_description)
    seo_block = build_seo_block(
        site_url, lang, rel_path, languages, default_lang, page_title, description, site_name
    )
    # Remove any previously injected block so reprocessing is idempotent.
    document = re.sub(
        r"\s*<!-- HT_SEO_START -->.*?<!-- HT_SEO_END -->\s*",
        "\n",
        document,
        flags=re.S,
    )
    if re.search(r'<meta\s+name="description"\s+content="[^"]*"\s*/?>', document, flags=re.I):
        # Rewrite the existing description in place (first occurrence only).
        document = re.sub(
            r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)',
            r"\1" + html.escape(description, quote=True) + r"\2",
            document,
            count=1,
            flags=re.I,
        )
    elif title_match:
        # No description tag present: insert one right after <title>.
        document = document.replace(
            title_match.group(0),
            title_match.group(0) + f'\n <meta name="description" content="{html.escape(description, quote=True)}">',
            1,
        )
    document = re.sub(r"</head>", f" {seo_block}\n </head>", document, count=1, flags=re.I)
    return document
def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang):
    """Write <book_dir>/sitemap.xml listing every page with hreflang alternates.

    Each <url> entry carries the page's canonical loc, a lastmod date taken
    from the build artifact's mtime (UTC), one xhtml:link alternate per
    language, and an x-default alternate pointing at ``default_lang``.
    """
    ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9")
    ET.register_namespace("xhtml", "http://www.w3.org/1999/xhtml")
    urlset = ET.Element("{http://www.sitemaps.org/schemas/sitemap/0.9}urlset")
    for html_file in iter_html_files(book_dir):
        rel_path = html_file.relative_to(book_dir)
        url = ET.SubElement(urlset, "{http://www.sitemaps.org/schemas/sitemap/0.9}url")
        ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = canonical_url(
            site_url, lang, rel_path
        )
        # lastmod reflects the build output's mtime, not the source commit date.
        lastmod = datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat()
        ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = lastmod
        for alt_lang in languages:
            ET.SubElement(
                url,
                "{http://www.w3.org/1999/xhtml}link",
                {
                    "rel": "alternate",
                    "hreflang": alt_lang,
                    "href": canonical_url(site_url, alt_lang, rel_path),
                },
            )
        ET.SubElement(
            url,
            "{http://www.w3.org/1999/xhtml}link",
            {
                "rel": "alternate",
                "hreflang": "x-default",
                "href": canonical_url(site_url, default_lang, rel_path),
            },
        )
    tree = ET.ElementTree(urlset)
    output = Path(book_dir) / "sitemap.xml"
    tree.write(output, encoding="utf-8", xml_declaration=True)
def process_pages(args):
    """Inject SEO tags into every rendered page, then emit the language sitemap.

    Rewrites each HTML file under ``args.book_dir`` in place via
    update_document, and finishes by writing <book-dir>/sitemap.xml.
    """
    book_dir = Path(args.book_dir)
    langs = parse_languages(args.languages)
    for page in iter_html_files(book_dir):
        relative = page.relative_to(book_dir)
        original = page.read_text(encoding="utf-8")
        rewritten = update_document(
            original,
            args.site_url,
            args.lang,
            relative,
            langs,
            args.default_lang,
            args.site_name,
        )
        page.write_text(rewritten, encoding="utf-8")
    generate_language_sitemap(book_dir, args.site_url, args.lang, langs, args.default_lang)
def generate_sitemap_index(args):
    """Write a sitemap index pointing at each language's /<lang>/sitemap.xml.

    One <sitemap> entry per parsed language code, each stamped with today's
    UTC date as lastmod; the file goes to ``args.output``.
    """
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9")
    root = ET.Element(ns + "sitemapindex")
    today = datetime.now(timezone.utc).date().isoformat()
    base = args.site_url.rstrip("/")
    for code in parse_languages(args.languages):
        entry = ET.SubElement(root, ns + "sitemap")
        ET.SubElement(entry, ns + "loc").text = f"{base}/{code}/sitemap.xml"
        ET.SubElement(entry, ns + "lastmod").text = today
    ET.ElementTree(root).write(args.output, encoding="utf-8", xml_declaration=True)
def main():
    """CLI entry point: parse arguments and dispatch to the chosen sub-command."""
    args = parse_args()
    handlers = {"pages": process_pages, "index": generate_sitemap_index}
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)


if __name__ == "__main__":
    main()

View File

@@ -1,4 +1,4 @@
Sitemap: https://www.hacktricks.wiki/sitemap.xml
Sitemap: https://cloud.hacktricks.wiki/sitemap.xml
User-agent: *
Disallow:
Disallow:

View File

@@ -163,7 +163,7 @@
<a class="menu-bar-link" href="https://hacktricks-training.com" target="_blank">
HT Training
</a>
<a class="menu-bar-link" href="https://book.hacktricks.wiki/" target="_blank">
<a class="menu-bar-link" href="https://hacktricks.wiki/" target="_blank">
HT Book
</a>
<a class="menu-bar-link" href="https://tools.hacktricks.wiki/" target="_blank">
@@ -184,7 +184,7 @@
<span class="menu-hamburger" aria-hidden="true">≡</span>
<div id="menubar-collapse-popup" class="menubar-collapse-popup" aria-label="Menu" role="menu">
<a href="https://hacktricks-training.com" target="_blank" role="menuitem" class="menu-bar-link">HT Training</a>
<a href="https://book.hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">Book HT</a>
<a href="https://hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">Book HT</a>
<a href="https://tools.hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">HT Tools</a>
<a href="https://github.com/sponsors/carlospolop" target="_blank" role="menuitem" class="menu-bar-link">Sponsor</a>
<a href="https://www.linkedin.com/company/hacktricks" target="_blank" role="menuitem" class="menu-bar-link">Linkedin</a>