Improve SEO metadata and translation discovery

This commit is contained in:
Carlos Polop
2026-05-09 17:34:28 +02:00
parent d13c270d7f
commit 39db3878cf
3 changed files with 167 additions and 32 deletions
+147 -12
View File
@@ -2,6 +2,7 @@ import argparse
import html import html
import json import json
import re import re
import subprocess
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
@@ -28,8 +29,45 @@ DEFAULT_LANGUAGES = [
] ]
SKIP_HTML = {"404.html", "print.html", "toc.html"} SKIP_HTML = {"404.html", "print.html", "toc.html"}
# File names of generated pages that process_noindex_pages rewrites so they
# carry a `<meta name="robots" content="noindex, follow">` directive.
NOINDEX_HTML = {"404.html", "print.html", "toc.html"}
SEO_START = "<!-- HT_SEO_START -->" SEO_START = "<!-- HT_SEO_START -->"
SEO_END = "<!-- HT_SEO_END -->" SEO_END = "<!-- HT_SEO_END -->"
# Hand-written homepage descriptions keyed by site name. homepage_description()
# looks the site name up here and falls back to a generic sentence when the
# name is not listed.
SITE_DESCRIPTIONS = {
"HackTricks Cloud": (
"HackTricks Cloud is a practical cloud security knowledge base covering AWS, GCP, Azure, "
"Kubernetes, CI/CD, and workspace pentesting techniques."
),
}
# Two-letter language code -> locale string written into the `og:locale` meta
# tag. The lookup is done with `.get(lang, lang)`, so a code missing from this
# table is emitted unchanged.
LANGUAGE_LOCALES = {
"af": "af_ZA",
"de": "de_DE",
"el": "el_GR",
"en": "en_US",
"es": "es_ES",
"fr": "fr_FR",
"hi": "hi_IN",
"it": "it_IT",
"ja": "ja_JP",
"ko": "ko_KR",
"pl": "pl_PL",
"pt": "pt_PT",
"sr": "sr_RS",
"sw": "sw_KE",
"tr": "tr_TR",
"uk": "uk_UA",
"zh": "zh_CN",
}
# Substrings that mark a text fragment as unsuitable for a meta description.
# is_low_value_description() lowercases the candidate before the substring
# test, so every entry here must stay lowercase.
DESCRIPTION_SKIP_PATTERNS = (
"hacktricks logos",
"learn & practice",
"learn and practice",
"hacktricks training",
"full hacktricks training catalog",
)
# Matches a language-menu anchor of the form
#   <a id="xx" href="/yy/..." hreflang="xx"
# (case-insensitively) so that its href can be rewritten.
#   prefix - everything from `<a` up to and including the opening quote of href
#   lang   - the two-letter code taken from the id attribute
#   suffix - the closing quote of href through the hreflang attribute, whose
#            value must equal `lang` (backreference)
# The existing `/yy/...` href value sits between prefix and suffix and is not
# captured, so a replacement built from prefix + new path + suffix discards it.
LANGUAGE_MENU_LINK_RE = re.compile(
r'(?P<prefix><a\s+id="(?P<lang>[a-z]{2})"\s+href=")/[a-z]{2}/[^"]*(?P<suffix>"\s+hreflang="(?P=lang)")',
flags=re.I,
)
def parse_args(): def parse_args():
@@ -46,17 +84,18 @@ def parse_args():
index_cmd = subparsers.add_parser("index") index_cmd = subparsers.add_parser("index")
index_cmd.add_argument("--site-url", required=True) index_cmd.add_argument("--site-url", required=True)
index_cmd.add_argument("--languages", required=True) index_cmd.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
index_cmd.add_argument("--output", required=True) index_cmd.add_argument("--output", required=True)
return parser.parse_args() return parser.parse_args()
def parse_languages(raw): def parse_languages(raw):
supported = set(DEFAULT_LANGUAGES)
langs = [] langs = []
for item in raw.split(","): for item in raw.split(","):
code = item.strip() code = item.strip()
if re.fullmatch(r"[a-z]{2}", code): if re.fullmatch(r"[a-z]{2}", code) and code in supported:
langs.append(code) langs.append(code)
return sorted(set(langs)) return sorted(set(langs))
@@ -100,6 +139,13 @@ def trim_description(text, fallback):
return cut + "..." return cut + "..."
def is_low_value_description(text):
    """Return True when *text* should not be used as a page description.

    A fragment is rejected when it contains any DESCRIPTION_SKIP_PATTERNS
    entry (compared against the lowercased text), or when fewer than 40
    characters remain after removing every non-word character and underscore.
    """
    haystack = text.lower()
    for boilerplate in DESCRIPTION_SKIP_PATTERNS:
        if boilerplate in haystack:
            return True
    # Count only letters and digits so punctuation and spacing cannot pad a
    # short fragment past the threshold.
    significant = re.sub(r"[\W_]+", "", text)
    return len(significant) < 40
def extract_description(document, fallback): def extract_description(document, fallback):
main_match = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S) main_match = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S)
scope = main_match.group(1) if main_match else document scope = main_match.group(1) if main_match else document
@@ -107,12 +153,16 @@ def extract_description(document, fallback):
for pattern in (r"<p\b[^>]*>(.*?)</p>", r"<li\b[^>]*>(.*?)</li>", r"<h[12]\b[^>]*>(.*?)</h[12]>"): for pattern in (r"<p\b[^>]*>(.*?)</p>", r"<li\b[^>]*>(.*?)</li>", r"<h[12]\b[^>]*>(.*?)</h[12]>"):
for match in re.finditer(pattern, scope, flags=re.I | re.S): for match in re.finditer(pattern, scope, flags=re.I | re.S):
text = clean_text(match.group(1)) text = clean_text(match.group(1))
if len(text) >= 40: if len(text) >= 40 and not is_low_value_description(text):
return trim_description(text, fallback) return trim_description(text, fallback)
return trim_description(clean_text(scope), fallback) return trim_description(clean_text(scope), fallback)
def homepage_description(site_name):
    """Return the homepage meta description for *site_name*.

    Uses the curated text from SITE_DESCRIPTIONS when the site is listed
    there, otherwise a generic sentence built from the site name.
    """
    generic = f"{site_name}: practical cloud security guides and references."
    return SITE_DESCRIPTIONS.get(site_name, generic)
def strip_index_suffix(path): def strip_index_suffix(path):
return re.sub(r"(?:^|/)index\.html$", "", path.as_posix()) return re.sub(r"(?:^|/)index\.html$", "", path.as_posix())
@@ -121,6 +171,38 @@ def is_homepage(rel_path):
return rel_path.as_posix() == "index.html" return rel_path.as_posix() == "index.html"
def source_path_for_html(book_dir, rel_path):
    """Locate the Markdown file a generated HTML page was built from.

    The sources are looked up under ``src/`` in the parent directory of the
    resolved *book_dir*. An ``index.html`` page maps to ``README.md`` in the
    same directory; any other page maps to the same relative path with a
    ``.md`` suffix.

    Args:
        book_dir: Directory holding the generated HTML (str or Path).
        rel_path: Path of the HTML page relative to *book_dir*.

    Returns:
        The Path of the Markdown source if it exists on disk, else None.
    """
    if rel_path.name == "index.html":
        markdown_rel = rel_path.parent / "README.md"
    else:
        markdown_rel = rel_path.with_suffix(".md")

    candidate = Path(book_dir).resolve().parent / "src" / markdown_rel
    if candidate.exists():
        return candidate
    return None
def git_lastmod(source_path):
    """Return the date of the last git commit touching *source_path*.

    Runs ``git log -1 --format=%cs`` from the file's parent directory with
    stderr discarded.

    Args:
        source_path: Path of the file to query.

    Returns:
        The stripped output of the git command, or None when git cannot be
        started, exits with an error, or prints nothing.
    """
    command = ["git", "log", "-1", "--format=%cs", "--", str(source_path)]
    try:
        raw = subprocess.check_output(
            command,
            cwd=source_path.parent,
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # Best effort: a missing git binary, a missing directory or a
        # non-repository checkout simply means "no date available".
        return None

    committed_on = raw.strip()
    return committed_on if committed_on else None
def page_lastmod(book_dir, rel_path, html_file):
    """Return the last-modified date string for a generated page.

    Prefers the git date of the page's Markdown source (via
    source_path_for_html and git_lastmod). When no source file or no git
    date is available, falls back to the modification time of *html_file*,
    formatted as a UTC ISO date.
    """
    markdown_source = source_path_for_html(book_dir, rel_path)
    committed_on = git_lastmod(markdown_source) if markdown_source else None
    if committed_on:
        return committed_on

    built_at = datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc)
    return built_at.date().isoformat()
def humanize_slug(value): def humanize_slug(value):
value = value.replace(".html", "").replace("-", " ").replace("_", " ").strip() value = value.replace(".html", "").replace("-", " ").replace("_", " ").strip()
value = re.sub(r"\s+", " ", value) value = re.sub(r"\s+", " ", value)
@@ -140,7 +222,7 @@ def breadcrumb_items(site_url, lang, rel_path):
return items return items
def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url): def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages):
current_url = canonical_url(site_url, lang, rel_path) current_url = canonical_url(site_url, lang, rel_path)
site_root = site_url.rstrip("/") site_root = site_url.rstrip("/")
website_url = canonical_url(site_url, "en", Path("index.html")) website_url = canonical_url(site_url, "en", Path("index.html"))
@@ -159,7 +241,7 @@ def build_structured_data(site_url, lang, rel_path, title, description, site_nam
"@id": f"{site_root}/#website", "@id": f"{site_root}/#website",
"url": site_root, "url": site_root,
"name": site_name, "name": site_name,
"inLanguage": "en", "inLanguage": languages,
"publisher": {"@id": f"{site_root}/#organization"}, "publisher": {"@id": f"{site_root}/#organization"},
}, },
{ {
@@ -203,7 +285,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
current_url = canonical_url(site_url, lang, rel_path) current_url = canonical_url(site_url, lang, rel_path)
image_url = social_image_url(site_url) image_url = social_image_url(site_url)
structured_data = json.dumps( structured_data = json.dumps(
build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url), build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages),
ensure_ascii=False, ensure_ascii=False,
separators=(",", ":"), separators=(",", ":"),
) )
@@ -228,7 +310,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
f'<meta property="og:image:secure_url" content="{html.escape(image_url, quote=True)}">', f'<meta property="og:image:secure_url" content="{html.escape(image_url, quote=True)}">',
f'<meta property="og:image:type" content="image/svg+xml">', f'<meta property="og:image:type" content="image/svg+xml">',
f'<meta property="og:image:alt" content="{html.escape(site_name, quote=True)}">', f'<meta property="og:image:alt" content="{html.escape(site_name, quote=True)}">',
f'<meta property="og:locale" content="{html.escape(lang, quote=True)}">', f'<meta property="og:locale" content="{html.escape(LANGUAGE_LOCALES.get(lang, lang), quote=True)}">',
f'<meta name="twitter:card" content="summary_large_image">', f'<meta name="twitter:card" content="summary_large_image">',
f'<meta name="twitter:title" content="{html.escape(title, quote=True)}">', f'<meta name="twitter:title" content="{html.escape(title, quote=True)}">',
f'<meta name="twitter:description" content="{html.escape(description, quote=True)}">', f'<meta name="twitter:description" content="{html.escape(description, quote=True)}">',
@@ -240,11 +322,24 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
return "\n ".join(lines) return "\n ".join(lines)
def update_language_menu_links(document, rel_path, languages):
    """Point every language-menu link at the current page in that language.

    Each anchor matched by LANGUAGE_MENU_LINK_RE whose language code is in
    *languages* gets its href replaced with ``/<lang>/<rel_path>``. The path
    is HTML-escaped. Anchors for other language codes are left untouched.

    Args:
        document: HTML text of the page.
        rel_path: Path of the page relative to the language root.
        languages: Iterable of language codes that may be rewritten.

    Returns:
        The HTML text with the rewritten links.
    """
    enabled = frozenset(languages)
    escaped_path = html.escape(rel_path.as_posix(), quote=True)

    def _retarget(link):
        code = link.group("lang")
        if code in enabled:
            return "".join(
                (link.group("prefix"), "/", code, "/", escaped_path, link.group("suffix"))
            )
        return link.group(0)

    return LANGUAGE_MENU_LINK_RE.sub(_retarget, document)
def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name): def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name):
title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S) title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S)
page_title = clean_text(title_match.group(1)) if title_match else site_name page_title = clean_text(title_match.group(1)) if title_match else site_name
fallback_description = f"{site_name}: {page_title}" fallback_description = f"{site_name}: {page_title}"
description = extract_description(document, fallback_description) description = homepage_description(site_name) if is_homepage(rel_path) else extract_description(document, fallback_description)
seo_block = build_seo_block( seo_block = build_seo_block(
site_url, lang, rel_path, languages, default_lang, page_title, description, site_name site_url, lang, rel_path, languages, default_lang, page_title, description, site_name
) )
@@ -259,7 +354,7 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
if re.search(r'<meta\s+name="description"\s+content="[^"]*"\s*/?>', document, flags=re.I): if re.search(r'<meta\s+name="description"\s+content="[^"]*"\s*/?>', document, flags=re.I):
document = re.sub( document = re.sub(
r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)', r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)',
r"\1" + html.escape(description, quote=True) + r"\2", lambda match: match.group(1) + html.escape(description, quote=True) + match.group(2),
document, document,
count=1, count=1,
flags=re.I, flags=re.I,
@@ -271,10 +366,48 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
1, 1,
) )
document = re.sub(r"</head>", f" {seo_block}\n </head>", document, count=1, flags=re.I) document = re.sub(
r"</head>",
lambda match: f" {seo_block}\n {match.group(0)}",
document,
count=1,
flags=re.I,
)
document = update_language_menu_links(document, rel_path, languages)
return document return document
def update_noindex_document(document, rel_path, languages):
    """Mark a page as ``noindex, follow`` and refresh its language-menu links.

    If the page already has a ``<meta name="robots" ...>`` tag, the content
    of the first one is overwritten. Otherwise a new tag is inserted right
    after the first ``</title>``. A page with neither is left without a
    robots tag. The result is then passed through
    update_language_menu_links.
    """
    robots_tag = re.compile(r'(<meta\s+name="robots"\s+content=")[^"]*("\s*/?>)', flags=re.I)
    document, replaced = robots_tag.subn(
        lambda tag: tag.group(1) + "noindex, follow" + tag.group(2),
        document,
        count=1,
    )
    if not replaced:
        # No existing robots tag: add one directly after the title element.
        document = re.sub(
            r"</title>",
            '</title>\n <meta name="robots" content="noindex, follow">',
            document,
            count=1,
            flags=re.I,
        )
    return update_language_menu_links(document, rel_path, languages)
def process_noindex_pages(book_dir, languages):
    """Apply update_noindex_document to every NOINDEX_HTML page under *book_dir*.

    Walks *book_dir* recursively, and for each HTML file whose name is in
    NOINDEX_HTML rewrites the file in place (UTF-8). A file is written only
    when its content actually changed.
    """
    candidates = (
        page for page in Path(book_dir).rglob("*.html") if page.name in NOINDEX_HTML
    )
    for page in candidates:
        relative = page.relative_to(book_dir)
        before = page.read_text(encoding="utf-8")
        after = update_noindex_document(before, relative, languages)
        if after != before:
            page.write_text(after, encoding="utf-8")
def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang): def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang):
ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9") ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9")
ET.register_namespace("xhtml", "http://www.w3.org/1999/xhtml") ET.register_namespace("xhtml", "http://www.w3.org/1999/xhtml")
@@ -287,8 +420,9 @@ def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang)
ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = canonical_url( ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = canonical_url(
site_url, lang, rel_path site_url, lang, rel_path
) )
lastmod = datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat() ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = page_lastmod(
ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = lastmod book_dir, rel_path, html_file
)
for alt_lang in languages: for alt_lang in languages:
ET.SubElement( ET.SubElement(
@@ -334,6 +468,7 @@ def process_pages(args):
) )
html_file.write_text(updated, encoding="utf-8") html_file.write_text(updated, encoding="utf-8")
process_noindex_pages(book_dir, languages)
generate_language_sitemap(book_dir, args.site_url, args.lang, languages, args.default_lang) generate_language_sitemap(book_dir, args.site_url, args.lang, languages, args.default_lang)
+3 -3
View File
@@ -625,13 +625,14 @@ function playground_text(playground, hidden = true) {
// Should not be needed, but it works around an issue on macOS & iOS: https://github.com/rust-lang/mdBook/issues/628 // Should not be needed, but it works around an issue on macOS & iOS: https://github.com/rust-lang/mdBook/issues/628
document.addEventListener('click', function(e) { document.addEventListener('click', function(e) {
if (menubarLanguagePopup.style.display === 'block' && !menubarLanguageToggleButton.contains(e.target) && !menubarLanguagePopup.contains(e.target)) { if (menubarLanguagePopup.style.display === 'flex' && !menubarLanguageToggleButton.contains(e.target) && !menubarLanguagePopup.contains(e.target)) {
hideLanguage(); hideLanguage();
} }
}); });
languageButtons.forEach((btn) => { languageButtons.forEach((btn) => {
btn.addEventListener('click', function(e) { btn.addEventListener('click', function(e) {
e.preventDefault();
const regex = /(?:(?:\/)+(?<lang>[a-z]{2}(?=\/|$)))?(?<path>(?:\/)*.*)?/g const regex = /(?:(?:\/)+(?<lang>[a-z]{2}(?=\/|$)))?(?<path>(?:\/)*.*)?/g
var match = regex.exec(window.location.pathname) var match = regex.exec(window.location.pathname)
@@ -641,7 +642,7 @@ function playground_text(playground, hidden = true) {
const lang = match.groups.lang const lang = match.groups.lang
console.log(`Lang: ${lang}`) console.log(`Lang: ${lang}`)
window.location = `/${e.target.id}${path}${window.location.hash}` window.location = `/${e.currentTarget.id}${path}${window.location.hash}`
}); });
}) })
})(); })();
@@ -788,4 +789,3 @@ function playground_text(playground, hidden = true) {
document.addEventListener('scroll', updateBorder, { passive: true }); document.addEventListener('scroll', updateBorder, { passive: true });
})(); })();
})(); })();
+17 -17
View File
@@ -206,23 +206,23 @@
<span class="translations-caret" aria-hidden="true">▾</span> <span class="translations-caret" aria-hidden="true">▾</span>
<div id="menubar-languages-popup" class="menubar-languages-popup" aria-label="Language menu" role="language menu"> <div id="menubar-languages-popup" class="menubar-languages-popup" aria-label="Language menu" role="language menu">
<button id="af" role="menuitem" class="menu-bar-link">Afrikaans</button> <a id="af" href="/af/{{ path }}" hreflang="af" lang="af" role="menuitem" class="menu-bar-link">Afrikaans</a>
<button id="zh" role="menuitem" class="menu-bar-link">Chinese</button> <a id="zh" href="/zh/{{ path }}" hreflang="zh" lang="zh" role="menuitem" class="menu-bar-link">Chinese</a>
<button id="en" role="menuitem" class="menu-bar-link">English</button> <a id="en" href="/en/{{ path }}" hreflang="en" lang="en" role="menuitem" class="menu-bar-link">English</a>
<button id="fr" role="menuitem" class="menu-bar-link">French</button> <a id="fr" href="/fr/{{ path }}" hreflang="fr" lang="fr" role="menuitem" class="menu-bar-link">French</a>
<button id="de" role="menuitem" class="menu-bar-link">German</button> <a id="de" href="/de/{{ path }}" hreflang="de" lang="de" role="menuitem" class="menu-bar-link">German</a>
<button id="el" role="menuitem" class="menu-bar-link">Greek</button> <a id="el" href="/el/{{ path }}" hreflang="el" lang="el" role="menuitem" class="menu-bar-link">Greek</a>
<button id="hi" role="menuitem" class="menu-bar-link">Hindi</button> <a id="hi" href="/hi/{{ path }}" hreflang="hi" lang="hi" role="menuitem" class="menu-bar-link">Hindi</a>
<button id="it" role="menuitem" class="menu-bar-link">Italian</button> <a id="it" href="/it/{{ path }}" hreflang="it" lang="it" role="menuitem" class="menu-bar-link">Italian</a>
<button id="ja" role="menuitem" class="menu-bar-link">Japanese</button> <a id="ja" href="/ja/{{ path }}" hreflang="ja" lang="ja" role="menuitem" class="menu-bar-link">Japanese</a>
<button id="ko" role="menuitem" class="menu-bar-link">Korean</button> <a id="ko" href="/ko/{{ path }}" hreflang="ko" lang="ko" role="menuitem" class="menu-bar-link">Korean</a>
<button id="pl" role="menuitem" class="menu-bar-link">Polish</button> <a id="pl" href="/pl/{{ path }}" hreflang="pl" lang="pl" role="menuitem" class="menu-bar-link">Polish</a>
<button id="pt" role="menuitem" class="menu-bar-link">Portuguese</button> <a id="pt" href="/pt/{{ path }}" hreflang="pt" lang="pt" role="menuitem" class="menu-bar-link">Portuguese</a>
<button id="sr" role="menuitem" class="menu-bar-link">Serbian</button> <a id="sr" href="/sr/{{ path }}" hreflang="sr" lang="sr" role="menuitem" class="menu-bar-link">Serbian</a>
<button id="es" role="menuitem" class="menu-bar-link">Spanish</button> <a id="es" href="/es/{{ path }}" hreflang="es" lang="es" role="menuitem" class="menu-bar-link">Spanish</a>
<button id="sw" role="menuitem" class="menu-bar-link">Swahili</button> <a id="sw" href="/sw/{{ path }}" hreflang="sw" lang="sw" role="menuitem" class="menu-bar-link">Swahili</a>
<button id="tr" role="menuitem" class="menu-bar-link">Turkish</button> <a id="tr" href="/tr/{{ path }}" hreflang="tr" lang="tr" role="menuitem" class="menu-bar-link">Turkish</a>
<button id="uk" role="menuitem" class="menu-bar-link">Ukrainian</button> <a id="uk" href="/uk/{{ path }}" hreflang="uk" lang="uk" role="menuitem" class="menu-bar-link">Ukrainian</a>
</div> </div>
</div> </div>