Improve SEO metadata and translation discovery

2026-07-28 22:51:09 -07:00 · 2026-05-09 17:34:28 +02:00
parent d13c270d7f
commit 39db3878cf
3 changed files with 167 additions and 32 deletions
@@ -2,6 +2,7 @@ import argparse
 import html
 import json
 import re
+import subprocess
 from datetime import datetime, timezone
 from pathlib import Path
 import xml.etree.ElementTree as ET
@@ -28,8 +29,45 @@ DEFAULT_LANGUAGES = [
 ]

 SKIP_HTML = {"404.html", "print.html", "toc.html"}
+NOINDEX_HTML = {"404.html", "print.html", "toc.html"}
 SEO_START = "<!-- HT_SEO_START -->"
 SEO_END = "<!-- HT_SEO_END -->"
+SITE_DESCRIPTIONS = {
+    "HackTricks Cloud": (
+        "HackTricks Cloud is a practical cloud security knowledge base covering AWS, GCP, Azure, "
+        "Kubernetes, CI/CD, and workspace pentesting techniques."
+    ),
+}
+LANGUAGE_LOCALES = {
+    "af": "af_ZA",
+    "de": "de_DE",
+    "el": "el_GR",
+    "en": "en_US",
+    "es": "es_ES",
+    "fr": "fr_FR",
+    "hi": "hi_IN",
+    "it": "it_IT",
+    "ja": "ja_JP",
+    "ko": "ko_KR",
+    "pl": "pl_PL",
+    "pt": "pt_PT",
+    "sr": "sr_RS",
+    "sw": "sw_KE",
+    "tr": "tr_TR",
+    "uk": "uk_UA",
+    "zh": "zh_CN",
+}
+DESCRIPTION_SKIP_PATTERNS = (
+    "hacktricks logos",
+    "learn & practice",
+    "learn and practice",
+    "hacktricks training",
+    "full hacktricks training catalog",
+)
+LANGUAGE_MENU_LINK_RE = re.compile(
+    r'(?P<prefix><a\s+id="(?P<lang>[a-z]{2})"\s+href=")/[a-z]{2}/[^"]*(?P<suffix>"\s+hreflang="(?P=lang)")',
+    flags=re.I,
+)


 def parse_args():
@@ -46,17 +84,18 @@ def parse_args():

    index_cmd = subparsers.add_parser("index")
    index_cmd.add_argument("--site-url", required=True)
-    index_cmd.add_argument("--languages", required=True)
+    index_cmd.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
    index_cmd.add_argument("--output", required=True)

    return parser.parse_args()


 def parse_languages(raw):
+    supported = set(DEFAULT_LANGUAGES)
    langs = []
    for item in raw.split(","):
        code = item.strip()
-        if re.fullmatch(r"[a-z]{2}", code):
+        if re.fullmatch(r"[a-z]{2}", code) and code in supported:
            langs.append(code)
    return sorted(set(langs))

@@ -100,6 +139,13 @@ def trim_description(text, fallback):
    return cut + "..."


+def is_low_value_description(text):
+    """Return True for boilerplate text or text with fewer than 40 alphanumeric characters."""
+    folded = text.lower()
+    boilerplate = any(marker in folded for marker in DESCRIPTION_SKIP_PATTERNS)
+    return boilerplate or len(re.sub(r"[\W_]+", "", text)) < 40
+
+
 def extract_description(document, fallback):
    main_match = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S)
    scope = main_match.group(1) if main_match else document
@@ -107,12 +153,16 @@ def extract_description(document, fallback):
    for pattern in (r"<p\b[^>]*>(.*?)</p>", r"<li\b[^>]*>(.*?)</li>", r"<h[12]\b[^>]*>(.*?)</h[12]>"):
        for match in re.finditer(pattern, scope, flags=re.I | re.S):
            text = clean_text(match.group(1))
-            if len(text) >= 40:
+            if len(text) >= 40 and not is_low_value_description(text):
                return trim_description(text, fallback)

    return trim_description(clean_text(scope), fallback)


+def homepage_description(site_name):  # curated SITE_DESCRIPTIONS entry, else a generic fallback
+    return SITE_DESCRIPTIONS.get(site_name, f"{site_name}: practical cloud security guides and references.")
+
+
 def strip_index_suffix(path):
    return re.sub(r"(?:^|/)index\.html$", "", path.as_posix())

@@ -121,6 +171,38 @@ def is_homepage(rel_path):
    return rel_path.as_posix() == "index.html"


+def source_path_for_html(book_dir, rel_path):
+    """Return the Markdown source under <book_dir>/../src for an HTML page, or None if absent."""
+    if rel_path.name == "index.html":
+        markdown_rel = rel_path.parent / "README.md"
+    else:
+        markdown_rel = rel_path.with_suffix(".md")
+    candidate = Path(book_dir).resolve().parent / "src" / markdown_rel
+    return candidate if candidate.exists() else None
+
+
+def git_lastmod(source_path):
+    """Return the short committer date (git ``%cs``) of the last commit touching the file, or None."""
+    command = ["git", "log", "-1", "--format=%cs", "--", str(source_path)]
+    try:
+        raw = subprocess.check_output(
+            command, cwd=source_path.parent, stderr=subprocess.DEVNULL, text=True
+        )
+    except (OSError, subprocess.CalledProcessError):
+        return None  # git could not be run or exited non-zero
+    stamp = raw.strip()
+    return stamp if stamp else None
+
+
+def page_lastmod(book_dir, rel_path, html_file):
+    """Return the git date of the page's source, falling back to the HTML file's UTC mtime date."""
+    markdown = source_path_for_html(book_dir, rel_path)
+    committed = git_lastmod(markdown) if markdown else None
+    if committed:
+        return committed
+    return datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat()
+
+
 def humanize_slug(value):
    value = value.replace(".html", "").replace("-", " ").replace("_", " ").strip()
    value = re.sub(r"\s+", " ", value)
@@ -140,7 +222,7 @@ def breadcrumb_items(site_url, lang, rel_path):
    return items


-def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url):
+def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages):
    current_url = canonical_url(site_url, lang, rel_path)
    site_root = site_url.rstrip("/")
    website_url = canonical_url(site_url, "en", Path("index.html"))
@@ -159,7 +241,7 @@ def build_structured_data(site_url, lang, rel_path, title, description, site_nam
            "@id": f"{site_root}/#website",
            "url": site_root,
            "name": site_name,
-            "inLanguage": "en",
+            "inLanguage": languages,
            "publisher": {"@id": f"{site_root}/#organization"},
        },
        {
@@ -203,7 +285,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
    current_url = canonical_url(site_url, lang, rel_path)
    image_url = social_image_url(site_url)
    structured_data = json.dumps(
-        build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url),
+        build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages),
        ensure_ascii=False,
        separators=(",", ":"),
    )
@@ -228,7 +310,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
            f'<meta property="og:image:secure_url" content="{html.escape(image_url, quote=True)}">',
            f'<meta property="og:image:type" content="image/svg+xml">',
            f'<meta property="og:image:alt" content="{html.escape(site_name, quote=True)}">',
-            f'<meta property="og:locale" content="{html.escape(lang, quote=True)}">',
+            f'<meta property="og:locale" content="{html.escape(LANGUAGE_LOCALES.get(lang, lang), quote=True)}">',
            f'<meta name="twitter:card" content="summary_large_image">',
            f'<meta name="twitter:title" content="{html.escape(title, quote=True)}">',
            f'<meta name="twitter:description" content="{html.escape(description, quote=True)}">',
@@ -240,11 +322,24 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
    return "\n        ".join(lines)


+def update_language_menu_links(document, rel_path, languages):
+    """Point each language-menu link matched by LANGUAGE_MENU_LINK_RE at /<lang>/<rel_path>."""
+    enabled = frozenset(languages)
+    page = html.escape(rel_path.as_posix(), quote=True)
+
+    def retarget(found):
+        if found["lang"] in enabled:
+            return f'{found["prefix"]}/{found["lang"]}/{page}{found["suffix"]}'
+        return found[0]  # links for languages outside `languages` are kept as they are
+
+    return LANGUAGE_MENU_LINK_RE.sub(retarget, document)
+
+
 def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name):
    title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S)
    page_title = clean_text(title_match.group(1)) if title_match else site_name
    fallback_description = f"{site_name}: {page_title}"
-    description = extract_description(document, fallback_description)
+    description = homepage_description(site_name) if is_homepage(rel_path) else extract_description(document, fallback_description)
    seo_block = build_seo_block(
        site_url, lang, rel_path, languages, default_lang, page_title, description, site_name
    )
@@ -259,7 +354,7 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
    if re.search(r'<meta\s+name="description"\s+content="[^"]*"\s*/?>', document, flags=re.I):
        document = re.sub(
            r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)',
-            r"\1" + html.escape(description, quote=True) + r"\2",
+            lambda match: match.group(1) + html.escape(description, quote=True) + match.group(2),
            document,
            count=1,
            flags=re.I,
@@ -271,10 +366,48 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
            1,
        )

-    document = re.sub(r"</head>", f"        {seo_block}\n    </head>", document, count=1, flags=re.I)
+    document = re.sub(
+        r"</head>",
+        lambda match: f"        {seo_block}\n    {match.group(0)}",
+        document,
+        count=1,
+        flags=re.I,
+    )
+    document = update_language_menu_links(document, rel_path, languages)
    return document


+def update_noindex_document(document, rel_path, languages):
+    """Set the robots meta tag to "noindex, follow" and rewrite language-menu links.
+
+    An existing robots tag is overwritten; otherwise one is added after the first </title>.
+    """
+    robots_meta = re.compile(r'(<meta\s+name="robots"\s+content=")[^"]*("\s*/?>)', flags=re.I)
+    document, replaced = robots_meta.subn(
+        lambda found: found.group(1) + "noindex, follow" + found.group(2), document, count=1
+    )
+    if not replaced:
+        document = re.sub(
+            r"</title>",
+            '</title>\n        <meta name="robots" content="noindex, follow">',
+            document,
+            count=1,
+            flags=re.I,
+        )
+    return update_language_menu_links(document, rel_path, languages)
+
+
+def process_noindex_pages(book_dir, languages):
+    """Mark every NOINDEX_HTML page under book_dir as noindex, rewriting only files that change."""
+    for page in Path(book_dir).rglob("*.html"):
+        if page.name in NOINDEX_HTML:
+            rel = page.relative_to(book_dir)
+            original = page.read_text(encoding="utf-8")
+            rewritten = update_noindex_document(original, rel, languages)
+            if rewritten != original:
+                page.write_text(rewritten, encoding="utf-8")
+
+
 def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang):
    ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9")
    ET.register_namespace("xhtml", "http://www.w3.org/1999/xhtml")
@@ -287,8 +420,9 @@ def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang)
        ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = canonical_url(
            site_url, lang, rel_path
        )
-        lastmod = datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat()
-        ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = lastmod
+        ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = page_lastmod(
+            book_dir, rel_path, html_file
+        )

        for alt_lang in languages:
            ET.SubElement(
@@ -334,6 +468,7 @@ def process_pages(args):
        )
        html_file.write_text(updated, encoding="utf-8")

+    process_noindex_pages(book_dir, languages)
    generate_language_sitemap(book_dir, args.site_url, args.lang, languages, args.default_lang)