diff --git a/scripts/seo_postprocess.py b/scripts/seo_postprocess.py index 98addefa1..b61432896 100644 --- a/scripts/seo_postprocess.py +++ b/scripts/seo_postprocess.py @@ -2,6 +2,7 @@ import argparse import html import json import re +import subprocess from datetime import datetime, timezone from pathlib import Path import xml.etree.ElementTree as ET @@ -28,8 +29,45 @@ DEFAULT_LANGUAGES = [ ] SKIP_HTML = {"404.html", "print.html", "toc.html"} +NOINDEX_HTML = {"404.html", "print.html", "toc.html"} SEO_START = "" SEO_END = "" +SITE_DESCRIPTIONS = { + "HackTricks Cloud": ( + "HackTricks Cloud is a practical cloud security knowledge base covering AWS, GCP, Azure, " + "Kubernetes, CI/CD, and workspace pentesting techniques." + ), +} +LANGUAGE_LOCALES = { + "af": "af_ZA", + "de": "de_DE", + "el": "el_GR", + "en": "en_US", + "es": "es_ES", + "fr": "fr_FR", + "hi": "hi_IN", + "it": "it_IT", + "ja": "ja_JP", + "ko": "ko_KR", + "pl": "pl_PL", + "pt": "pt_PT", + "sr": "sr_RS", + "sw": "sw_KE", + "tr": "tr_TR", + "uk": "uk_UA", + "zh": "zh_CN", +} +DESCRIPTION_SKIP_PATTERNS = ( + "hacktricks logos", + "learn & practice", + "learn and practice", + "hacktricks training", + "full hacktricks training catalog", +) +LANGUAGE_MENU_LINK_RE = re.compile( + r'(?P[a-z]{2})"\s+href=")/[a-z]{2}/[^"]*(?P"\s+hreflang="(?P=lang)")', + flags=re.I, +) def parse_args(): @@ -46,17 +84,18 @@ def parse_args(): index_cmd = subparsers.add_parser("index") index_cmd.add_argument("--site-url", required=True) - index_cmd.add_argument("--languages", required=True) + index_cmd.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES)) index_cmd.add_argument("--output", required=True) return parser.parse_args() def parse_languages(raw): + supported = set(DEFAULT_LANGUAGES) langs = [] for item in raw.split(","): code = item.strip() - if re.fullmatch(r"[a-z]{2}", code): + if re.fullmatch(r"[a-z]{2}", code) and code in supported: langs.append(code) return sorted(set(langs)) @@ -100,6 +139,13 @@ def trim_description(text, fallback): return cut + "..." +def is_low_value_description(text): + lowered = text.lower() + if any(pattern in lowered for pattern in DESCRIPTION_SKIP_PATTERNS): + return True + return len(re.sub(r"[\W_]+", "", text)) < 40 + + def extract_description(document, fallback): main_match = re.search(r"]*>(.*?)", document, flags=re.I | re.S) scope = main_match.group(1) if main_match else document @@ -107,12 +153,16 @@ def extract_description(document, fallback): for pattern in (r"]*>(.*?)

", r"]*>(.*?)", r"]*>(.*?)"): for match in re.finditer(pattern, scope, flags=re.I | re.S): text = clean_text(match.group(1)) - if len(text) >= 40: + if len(text) >= 40 and not is_low_value_description(text): return trim_description(text, fallback) return trim_description(clean_text(scope), fallback) +def homepage_description(site_name): + return SITE_DESCRIPTIONS.get(site_name, f"{site_name}: practical cloud security guides and references.") + + def strip_index_suffix(path): return re.sub(r"(?:^|/)index\.html$", "", path.as_posix()) @@ -121,6 +171,38 @@ def is_homepage(rel_path): return rel_path.as_posix() == "index.html" +def source_path_for_html(book_dir, rel_path): + repo_root = Path(book_dir).resolve().parent + if rel_path.name == "index.html": + source_rel = Path("src") / rel_path.parent / "README.md" + else: + source_rel = Path("src") / rel_path.with_suffix(".md") + source_path = repo_root / source_rel + return source_path if source_path.exists() else None + + +def git_lastmod(source_path): + try: + output = subprocess.check_output( + ["git", "log", "-1", "--format=%cs", "--", str(source_path)], + cwd=source_path.parent, + stderr=subprocess.DEVNULL, + text=True, + ).strip() + except (OSError, subprocess.CalledProcessError): + return None + return output or None + + +def page_lastmod(book_dir, rel_path, html_file): + source_path = source_path_for_html(book_dir, rel_path) + if source_path: + lastmod = git_lastmod(source_path) + if lastmod: + return lastmod + return datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat() + + def humanize_slug(value): value = value.replace(".html", "").replace("-", " ").replace("_", " ").strip() value = re.sub(r"\s+", " ", value) @@ -140,7 +222,7 @@ def breadcrumb_items(site_url, lang, rel_path): return items -def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url): +def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages): current_url = canonical_url(site_url, lang, rel_path) site_root = site_url.rstrip("/") website_url = canonical_url(site_url, "en", Path("index.html")) @@ -159,7 +241,7 @@ def build_structured_data(site_url, lang, rel_path, title, description, site_nam "@id": f"{site_root}/#website", "url": site_root, "name": site_name, - "inLanguage": "en", + "inLanguage": languages, "publisher": {"@id": f"{site_root}/#organization"}, }, { @@ -203,7 +285,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de current_url = canonical_url(site_url, lang, rel_path) image_url = social_image_url(site_url) structured_data = json.dumps( - build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url), + build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages), ensure_ascii=False, separators=(",", ":"), ) @@ -228,7 +310,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de f'', f'', f'', - f'', + f'', f'', f'', f'', @@ -240,11 +322,24 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de return "\n ".join(lines) +def update_language_menu_links(document, rel_path, languages): + allowed_languages = set(languages) + target_path = html.escape(rel_path.as_posix(), quote=True) + + def replace(match): + lang = match.group("lang") + if lang not in allowed_languages: + return match.group(0) + return f'{match.group("prefix")}/{lang}/{target_path}{match.group("suffix")}' + + return LANGUAGE_MENU_LINK_RE.sub(replace, document) + + def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name): title_match = re.search(r"(.*?)", document, flags=re.I | re.S) page_title = clean_text(title_match.group(1)) if title_match else site_name fallback_description = f"{site_name}: {page_title}" - description = extract_description(document, fallback_description) + description = homepage_description(site_name) if is_homepage(rel_path) else extract_description(document, fallback_description) seo_block = build_seo_block( site_url, lang, rel_path, languages, default_lang, page_title, description, site_name ) @@ -259,7 +354,7 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang, if re.search(r'', document, flags=re.I): document = re.sub( r'()', - r"\1" + html.escape(description, quote=True) + r"\2", + lambda match: match.group(1) + html.escape(description, quote=True) + match.group(2), document, count=1, flags=re.I, @@ -271,10 +366,48 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang, 1, ) - document = re.sub(r"", f" {seo_block}\n ", document, count=1, flags=re.I) + document = re.sub( + r"", + lambda match: f" {seo_block}\n {match.group(0)}", + document, + count=1, + flags=re.I, + ) + document = update_language_menu_links(document, rel_path, languages) return document +def update_noindex_document(document, rel_path, languages): + if re.search(r'', document, flags=re.I): + document = re.sub( + r'()', + lambda match: match.group(1) + "noindex, follow" + match.group(2), + document, + count=1, + flags=re.I, + ) + else: + document = re.sub( + r"", + '\n ', + document, + count=1, + flags=re.I, + ) + return update_language_menu_links(document, rel_path, languages) + + +def process_noindex_pages(book_dir, languages): + for html_file in Path(book_dir).rglob("*.html"): + if html_file.name not in NOINDEX_HTML: + continue + rel_path = html_file.relative_to(book_dir) + content = html_file.read_text(encoding="utf-8") + updated = update_noindex_document(content, rel_path, languages) + if updated != content: + html_file.write_text(updated, encoding="utf-8") + + def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang): ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9") ET.register_namespace("xhtml", "http://www.w3.org/1999/xhtml") @@ -287,8 +420,9 @@ def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang) ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = canonical_url( site_url, lang, rel_path ) - lastmod = datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat() - ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = lastmod + ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = page_lastmod( + book_dir, rel_path, html_file + ) for alt_lang in languages: ET.SubElement( @@ -334,6 +468,7 @@ def process_pages(args): ) html_file.write_text(updated, encoding="utf-8") + process_noindex_pages(book_dir, languages) generate_language_sitemap(book_dir, args.site_url, args.lang, languages, args.default_lang) diff --git a/theme/book.js b/theme/book.js index b11674b5a..ec2f0679a 100644 --- a/theme/book.js +++ b/theme/book.js @@ -625,13 +625,14 @@ function playground_text(playground, hidden = true) { // Should not be needed, but it works around an issue on macOS & iOS: https://github.com/rust-lang/mdBook/issues/628 document.addEventListener('click', function(e) { - if (menubarLanguagePopup.style.display === 'block' && !menubarLanguageToggleButton.contains(e.target) && !menubarLanguagePopup.contains(e.target)) { + if (menubarLanguagePopup.style.display === 'flex' && !menubarLanguageToggleButton.contains(e.target) && !menubarLanguagePopup.contains(e.target)) { hideLanguage(); } }); languageButtons.forEach((btn) => { btn.addEventListener('click', function(e) { + e.preventDefault(); const regex = /(?:(?:\/)+(?[a-z]{2}(?=\/|$)))?(?(?:\/)*.*)?/g var match = regex.exec(window.location.pathname) @@ -641,7 +642,7 @@ function playground_text(playground, hidden = true) { const lang = match.groups.lang console.log(`Lang: ${lang}`) - window.location = `/${e.target.id}${path}${window.location.hash}` + window.location = `/${e.currentTarget.id}${path}${window.location.hash}` }); }) })(); @@ -788,4 +789,3 @@ function playground_text(playground, hidden = true) { document.addEventListener('scroll', updateBorder, { passive: true }); })(); })(); - diff --git a/theme/index.hbs b/theme/index.hbs index 41fa91d1d..2c8040f54 100644 --- a/theme/index.hbs +++ b/theme/index.hbs @@ -206,23 +206,23 @@