mirror of
https://github.com/HackTricks-wiki/hacktricks-cloud.git
synced 2026-06-12 11:01:38 -07:00
Improve SEO metadata and translation discovery
This commit is contained in:
+147
-12
@@ -2,6 +2,7 @@ import argparse
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
import xml.etree.ElementTree as ET
|
||||
@@ -28,8 +29,45 @@ DEFAULT_LANGUAGES = [
|
||||
]
|
||||
|
||||
SKIP_HTML = {"404.html", "print.html", "toc.html"}
|
||||
NOINDEX_HTML = {"404.html", "print.html", "toc.html"}
|
||||
SEO_START = "<!-- HT_SEO_START -->"
|
||||
SEO_END = "<!-- HT_SEO_END -->"
|
||||
SITE_DESCRIPTIONS = {
|
||||
"HackTricks Cloud": (
|
||||
"HackTricks Cloud is a practical cloud security knowledge base covering AWS, GCP, Azure, "
|
||||
"Kubernetes, CI/CD, and workspace pentesting techniques."
|
||||
),
|
||||
}
|
||||
LANGUAGE_LOCALES = {
|
||||
"af": "af_ZA",
|
||||
"de": "de_DE",
|
||||
"el": "el_GR",
|
||||
"en": "en_US",
|
||||
"es": "es_ES",
|
||||
"fr": "fr_FR",
|
||||
"hi": "hi_IN",
|
||||
"it": "it_IT",
|
||||
"ja": "ja_JP",
|
||||
"ko": "ko_KR",
|
||||
"pl": "pl_PL",
|
||||
"pt": "pt_PT",
|
||||
"sr": "sr_RS",
|
||||
"sw": "sw_KE",
|
||||
"tr": "tr_TR",
|
||||
"uk": "uk_UA",
|
||||
"zh": "zh_CN",
|
||||
}
|
||||
DESCRIPTION_SKIP_PATTERNS = (
|
||||
"hacktricks logos",
|
||||
"learn & practice",
|
||||
"learn and practice",
|
||||
"hacktricks training",
|
||||
"full hacktricks training catalog",
|
||||
)
|
||||
LANGUAGE_MENU_LINK_RE = re.compile(
|
||||
r'(?P<prefix><a\s+id="(?P<lang>[a-z]{2})"\s+href=")/[a-z]{2}/[^"]*(?P<suffix>"\s+hreflang="(?P=lang)")',
|
||||
flags=re.I,
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
@@ -46,17 +84,18 @@ def parse_args():
|
||||
|
||||
index_cmd = subparsers.add_parser("index")
|
||||
index_cmd.add_argument("--site-url", required=True)
|
||||
index_cmd.add_argument("--languages", required=True)
|
||||
index_cmd.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
|
||||
index_cmd.add_argument("--output", required=True)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def parse_languages(raw):
    """Parse a comma-separated language list into sorted, unique, supported codes.

    Args:
        raw: Comma-separated string of language codes, e.g. ``"en, es,fr"``.
            Whitespace around each entry is ignored.

    Returns:
        A sorted list of the distinct entries that consist of exactly two
        lowercase letters ``a``-``z`` and are members of ``DEFAULT_LANGUAGES``.
        Every other entry is dropped silently.
    """
    supported = set(DEFAULT_LANGUAGES)
    return sorted(
        {
            code
            for code in (item.strip() for item in raw.split(","))
            # Keep the shape check as well as the allow-list check so a
            # malformed entry can never pass, whatever the allow-list holds.
            if re.fullmatch(r"[a-z]{2}", code) and code in supported
        }
    )
|
||||
|
||||
@@ -100,6 +139,13 @@ def trim_description(text, fallback):
|
||||
return cut + "..."
|
||||
|
||||
|
||||
def is_low_value_description(text):
    """Return True when *text* is a poor candidate for a page description.

    A candidate is rejected when its lowercased form contains any entry of
    ``DESCRIPTION_SKIP_PATTERNS``, or when fewer than 40 characters remain
    after removing every run of non-word characters and underscores.

    Args:
        text: Candidate description text.

    Returns:
        ``True`` if the text should be skipped, ``False`` otherwise.
    """
    lowered = text.lower()
    if any(pattern in lowered for pattern in DESCRIPTION_SKIP_PATTERNS):
        return True
    # Count only word characters so punctuation and spacing cannot pad a
    # short snippet past the length threshold.
    meaningful = re.sub(r"[\W_]+", "", text)
    return len(meaningful) < 40
|
||||
|
||||
|
||||
def extract_description(document, fallback):
|
||||
main_match = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S)
|
||||
scope = main_match.group(1) if main_match else document
|
||||
@@ -107,12 +153,16 @@ def extract_description(document, fallback):
|
||||
for pattern in (r"<p\b[^>]*>(.*?)</p>", r"<li\b[^>]*>(.*?)</li>", r"<h[12]\b[^>]*>(.*?)</h[12]>"):
|
||||
for match in re.finditer(pattern, scope, flags=re.I | re.S):
|
||||
text = clean_text(match.group(1))
|
||||
if len(text) >= 40:
|
||||
if len(text) >= 40 and not is_low_value_description(text):
|
||||
return trim_description(text, fallback)
|
||||
|
||||
return trim_description(clean_text(scope), fallback)
|
||||
|
||||
|
||||
def homepage_description(site_name):
    """Return the description text used for the site's homepage.

    Args:
        site_name: Site name used as the lookup key in ``SITE_DESCRIPTIONS``.

    Returns:
        The entry from ``SITE_DESCRIPTIONS`` for *site_name*, or a generic
        sentence built from *site_name* when no entry exists.
    """
    generic = f"{site_name}: practical cloud security guides and references."
    return SITE_DESCRIPTIONS.get(site_name, generic)
|
||||
|
||||
|
||||
def strip_index_suffix(path):
    """Return *path* as a POSIX string without a trailing ``index.html`` segment.

    Args:
        path: Path-like object exposing ``as_posix()``.

    Returns:
        The POSIX form of *path*. When the final segment is exactly
        ``index.html`` it is removed together with the slash before it, so
        ``index.html`` alone becomes the empty string. Other paths are
        returned unchanged.
    """
    # The (?:^|/) anchor ensures only a whole "index.html" segment matches,
    # not a file name that merely ends with it (e.g. "myindex.html").
    return re.sub(r"(?:^|/)index\.html$", "", path.as_posix())
|
||||
|
||||
@@ -121,6 +171,38 @@ def is_homepage(rel_path):
|
||||
return rel_path.as_posix() == "index.html"
|
||||
|
||||
|
||||
def source_path_for_html(book_dir, rel_path):
    """Locate the Markdown source file that corresponds to a built HTML page.

    The lookup is done under ``src/`` in the parent directory of the resolved
    *book_dir*. An ``index.html`` page maps to ``README.md`` in the same
    relative directory; any other page maps to the same relative path with a
    ``.md`` suffix.

    Args:
        book_dir: Directory containing the built HTML output.
        rel_path: ``Path`` of the HTML page relative to *book_dir*.

    Returns:
        The ``Path`` of the source file if it exists on disk, otherwise
        ``None``.
    """
    repo_root = Path(book_dir).resolve().parent
    if rel_path.name == "index.html":
        source_rel = Path("src") / rel_path.parent / "README.md"
    else:
        source_rel = Path("src") / rel_path.with_suffix(".md")
    source_path = repo_root / source_rel
    return source_path if source_path.exists() else None
|
||||
|
||||
|
||||
def git_lastmod(source_path):
    """Return the date of the most recent git commit touching *source_path*.

    Runs ``git log -1 --format=%cs`` for the file, using the file's parent
    directory as the working directory.

    Args:
        source_path: ``Path`` of the file to look up.

    Returns:
        The stripped output of the git command, or ``None`` when the command
        cannot be started, exits with a non-zero status, or prints nothing.
    """
    try:
        output = subprocess.check_output(
            ["git", "log", "-1", "--format=%cs", "--", str(source_path)],
            cwd=source_path.parent,
            stderr=subprocess.DEVNULL,
            text=True,
        ).strip()
    except (OSError, subprocess.CalledProcessError):
        # Best effort: a missing git binary or a non-repository directory
        # simply means no git date is available.
        return None
    return output or None
|
||||
|
||||
|
||||
def page_lastmod(book_dir, rel_path, html_file):
    """Return the last-modified date string for a built HTML page.

    The git date of the page's source file is preferred. When no source file
    is found, or git yields no date for it, the modification time of the
    built HTML file is used instead.

    Args:
        book_dir: Directory containing the built HTML output.
        rel_path: ``Path`` of the page relative to *book_dir*.
        html_file: ``Path`` of the built HTML file on disk.

    Returns:
        The value from ``git_lastmod`` when available, otherwise the HTML
        file's mtime as an ISO ``YYYY-MM-DD`` date in UTC.
    """
    source_path = source_path_for_html(book_dir, rel_path)
    if source_path:
        lastmod = git_lastmod(source_path)
        if lastmod:
            return lastmod
    # Fall back to the build artefact's timestamp.
    mtime = html_file.stat().st_mtime
    return datetime.fromtimestamp(mtime, tz=timezone.utc).date().isoformat()
|
||||
|
||||
|
||||
def humanize_slug(value):
|
||||
value = value.replace(".html", "").replace("-", " ").replace("_", " ").strip()
|
||||
value = re.sub(r"\s+", " ", value)
|
||||
@@ -140,7 +222,7 @@ def breadcrumb_items(site_url, lang, rel_path):
|
||||
return items
|
||||
|
||||
|
||||
def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url):
|
||||
def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages):
|
||||
current_url = canonical_url(site_url, lang, rel_path)
|
||||
site_root = site_url.rstrip("/")
|
||||
website_url = canonical_url(site_url, "en", Path("index.html"))
|
||||
@@ -159,7 +241,7 @@ def build_structured_data(site_url, lang, rel_path, title, description, site_nam
|
||||
"@id": f"{site_root}/#website",
|
||||
"url": site_root,
|
||||
"name": site_name,
|
||||
"inLanguage": "en",
|
||||
"inLanguage": languages,
|
||||
"publisher": {"@id": f"{site_root}/#organization"},
|
||||
},
|
||||
{
|
||||
@@ -203,7 +285,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
|
||||
current_url = canonical_url(site_url, lang, rel_path)
|
||||
image_url = social_image_url(site_url)
|
||||
structured_data = json.dumps(
|
||||
build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url),
|
||||
build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url, languages),
|
||||
ensure_ascii=False,
|
||||
separators=(",", ":"),
|
||||
)
|
||||
@@ -228,7 +310,7 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
|
||||
f'<meta property="og:image:secure_url" content="{html.escape(image_url, quote=True)}">',
|
||||
f'<meta property="og:image:type" content="image/svg+xml">',
|
||||
f'<meta property="og:image:alt" content="{html.escape(site_name, quote=True)}">',
|
||||
f'<meta property="og:locale" content="{html.escape(lang, quote=True)}">',
|
||||
f'<meta property="og:locale" content="{html.escape(LANGUAGE_LOCALES.get(lang, lang), quote=True)}">',
|
||||
f'<meta name="twitter:card" content="summary_large_image">',
|
||||
f'<meta name="twitter:title" content="{html.escape(title, quote=True)}">',
|
||||
f'<meta name="twitter:description" content="{html.escape(description, quote=True)}">',
|
||||
@@ -240,11 +322,24 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, de
|
||||
return "\n ".join(lines)
|
||||
|
||||
|
||||
def update_language_menu_links(document, rel_path, languages):
    """Rewrite language-menu anchors so each points at this page's path.

    Every anchor matched by ``LANGUAGE_MENU_LINK_RE`` whose ``id`` language is
    in *languages* gets its ``href`` replaced with ``/<lang>/<rel_path>``.
    Anchors for other languages are left exactly as they were.

    Args:
        document: HTML text to rewrite.
        rel_path: Path-like object exposing ``as_posix()``; its POSIX form is
            HTML-escaped and used as the link target after the language prefix.
        languages: Iterable of language codes whose links may be rewritten.

    Returns:
        The document with the matching ``href`` values replaced.
    """
    allowed_languages = set(languages)
    target_path = html.escape(rel_path.as_posix(), quote=True)

    def replace(match):
        lang = match.group("lang")
        if lang not in allowed_languages:
            return match.group(0)
        return f'{match.group("prefix")}/{lang}/{target_path}{match.group("suffix")}'

    # A callable replacement keeps backslashes in the path from being
    # interpreted as regex group references.
    return LANGUAGE_MENU_LINK_RE.sub(replace, document)
|
||||
|
||||
|
||||
def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name):
|
||||
title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S)
|
||||
page_title = clean_text(title_match.group(1)) if title_match else site_name
|
||||
fallback_description = f"{site_name}: {page_title}"
|
||||
description = extract_description(document, fallback_description)
|
||||
description = homepage_description(site_name) if is_homepage(rel_path) else extract_description(document, fallback_description)
|
||||
seo_block = build_seo_block(
|
||||
site_url, lang, rel_path, languages, default_lang, page_title, description, site_name
|
||||
)
|
||||
@@ -259,7 +354,7 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
|
||||
if re.search(r'<meta\s+name="description"\s+content="[^"]*"\s*/?>', document, flags=re.I):
|
||||
document = re.sub(
|
||||
r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)',
|
||||
r"\1" + html.escape(description, quote=True) + r"\2",
|
||||
lambda match: match.group(1) + html.escape(description, quote=True) + match.group(2),
|
||||
document,
|
||||
count=1,
|
||||
flags=re.I,
|
||||
@@ -271,10 +366,48 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
|
||||
1,
|
||||
)
|
||||
|
||||
document = re.sub(r"</head>", f" {seo_block}\n </head>", document, count=1, flags=re.I)
|
||||
document = re.sub(
|
||||
r"</head>",
|
||||
lambda match: f" {seo_block}\n {match.group(0)}",
|
||||
document,
|
||||
count=1,
|
||||
flags=re.I,
|
||||
)
|
||||
document = update_language_menu_links(document, rel_path, languages)
|
||||
return document
|
||||
|
||||
|
||||
def update_noindex_document(document, rel_path, languages):
    """Mark an HTML document ``noindex, follow`` and refresh its language links.

    If a ``<meta name="robots" content="...">`` tag is present, the content of
    the first one is replaced. Otherwise a new robots meta tag is inserted
    right after the first closing ``</title>`` tag; a document without a
    closing title tag receives no robots meta tag.

    Args:
        document: HTML text to rewrite.
        rel_path: Page path passed through to ``update_language_menu_links``.
        languages: Language codes passed through to
            ``update_language_menu_links``.

    Returns:
        The rewritten HTML text.
    """
    robots_value = "noindex, follow"
    if re.search(r'<meta\s+name="robots"\s+content="[^"]*"\s*/?>', document, flags=re.I):
        document = re.sub(
            r'(<meta\s+name="robots"\s+content=")[^"]*("\s*/?>)',
            lambda match: match.group(1) + robots_value + match.group(2),
            document,
            count=1,
            flags=re.I,
        )
    else:
        document = re.sub(
            r"</title>",
            # Re-emit the matched tag so its original casing is preserved,
            # since the search is case-insensitive.
            lambda match: f'{match.group(0)}\n <meta name="robots" content="{robots_value}">',
            document,
            count=1,
            flags=re.I,
        )
    return update_language_menu_links(document, rel_path, languages)
|
||||
|
||||
|
||||
def process_noindex_pages(book_dir, languages):
    """Apply the noindex rewrite to every utility page under *book_dir*.

    Walks *book_dir* recursively and, for each ``.html`` file whose name is in
    ``NOINDEX_HTML``, runs ``update_noindex_document`` on its UTF-8 content.
    A file is written back only when the content actually changed.

    Args:
        book_dir: Directory containing the built HTML output.
        languages: Language codes passed through to
            ``update_noindex_document``.
    """
    for html_file in Path(book_dir).rglob("*.html"):
        if html_file.name not in NOINDEX_HTML:
            continue
        rel_path = html_file.relative_to(book_dir)
        content = html_file.read_text(encoding="utf-8")
        updated = update_noindex_document(content, rel_path, languages)
        if updated != content:
            html_file.write_text(updated, encoding="utf-8")
|
||||
|
||||
|
||||
def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang):
|
||||
ET.register_namespace("", "http://www.sitemaps.org/schemas/sitemap/0.9")
|
||||
ET.register_namespace("xhtml", "http://www.w3.org/1999/xhtml")
|
||||
@@ -287,8 +420,9 @@ def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang)
|
||||
ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text = canonical_url(
|
||||
site_url, lang, rel_path
|
||||
)
|
||||
lastmod = datetime.fromtimestamp(html_file.stat().st_mtime, tz=timezone.utc).date().isoformat()
|
||||
ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = lastmod
|
||||
ET.SubElement(url, "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod").text = page_lastmod(
|
||||
book_dir, rel_path, html_file
|
||||
)
|
||||
|
||||
for alt_lang in languages:
|
||||
ET.SubElement(
|
||||
@@ -334,6 +468,7 @@ def process_pages(args):
|
||||
)
|
||||
html_file.write_text(updated, encoding="utf-8")
|
||||
|
||||
process_noindex_pages(book_dir, languages)
|
||||
generate_language_sitemap(book_dir, args.site_url, args.lang, languages, args.default_lang)
|
||||
|
||||
|
||||
|
||||
+3
-3
@@ -625,13 +625,14 @@ function playground_text(playground, hidden = true) {
|
||||
|
||||
// Should not be needed, but it works around an issue on macOS & iOS: https://github.com/rust-lang/mdBook/issues/628
|
||||
document.addEventListener('click', function(e) {
|
||||
if (menubarLanguagePopup.style.display === 'block' && !menubarLanguageToggleButton.contains(e.target) && !menubarLanguagePopup.contains(e.target)) {
|
||||
if (menubarLanguagePopup.style.display === 'flex' && !menubarLanguageToggleButton.contains(e.target) && !menubarLanguagePopup.contains(e.target)) {
|
||||
hideLanguage();
|
||||
}
|
||||
});
|
||||
|
||||
languageButtons.forEach((btn) => {
|
||||
btn.addEventListener('click', function(e) {
|
||||
e.preventDefault();
|
||||
const regex = /(?:(?:\/)+(?<lang>[a-z]{2}(?=\/|$)))?(?<path>(?:\/)*.*)?/g
|
||||
var match = regex.exec(window.location.pathname)
|
||||
|
||||
@@ -641,7 +642,7 @@ function playground_text(playground, hidden = true) {
|
||||
const lang = match.groups.lang
|
||||
console.log(`Lang: ${lang}`)
|
||||
|
||||
window.location = `/${e.target.id}${path}${window.location.hash}`
|
||||
window.location = `/${e.currentTarget.id}${path}${window.location.hash}`
|
||||
});
|
||||
})
|
||||
})();
|
||||
@@ -788,4 +789,3 @@ function playground_text(playground, hidden = true) {
|
||||
document.addEventListener('scroll', updateBorder, { passive: true });
|
||||
})();
|
||||
})();
|
||||
|
||||
|
||||
+17
-17
@@ -206,23 +206,23 @@
|
||||
<span class="translations-caret" aria-hidden="true">▾</span>
|
||||
|
||||
<div id="menubar-languages-popup" class="menubar-languages-popup" aria-label="Language menu" role="menu">
    <!-- Each language is a real anchor with hreflang; ids are unique within the menu. -->
    <a id="af" href="/af/{{ path }}" hreflang="af" lang="af" role="menuitem" class="menu-bar-link">Afrikaans</a>
    <a id="zh" href="/zh/{{ path }}" hreflang="zh" lang="zh" role="menuitem" class="menu-bar-link">Chinese</a>
    <a id="en" href="/en/{{ path }}" hreflang="en" lang="en" role="menuitem" class="menu-bar-link">English</a>
    <a id="fr" href="/fr/{{ path }}" hreflang="fr" lang="fr" role="menuitem" class="menu-bar-link">French</a>
    <a id="de" href="/de/{{ path }}" hreflang="de" lang="de" role="menuitem" class="menu-bar-link">German</a>
    <a id="el" href="/el/{{ path }}" hreflang="el" lang="el" role="menuitem" class="menu-bar-link">Greek</a>
    <a id="hi" href="/hi/{{ path }}" hreflang="hi" lang="hi" role="menuitem" class="menu-bar-link">Hindi</a>
    <a id="it" href="/it/{{ path }}" hreflang="it" lang="it" role="menuitem" class="menu-bar-link">Italian</a>
    <a id="ja" href="/ja/{{ path }}" hreflang="ja" lang="ja" role="menuitem" class="menu-bar-link">Japanese</a>
    <a id="ko" href="/ko/{{ path }}" hreflang="ko" lang="ko" role="menuitem" class="menu-bar-link">Korean</a>
    <a id="pl" href="/pl/{{ path }}" hreflang="pl" lang="pl" role="menuitem" class="menu-bar-link">Polish</a>
    <a id="pt" href="/pt/{{ path }}" hreflang="pt" lang="pt" role="menuitem" class="menu-bar-link">Portuguese</a>
    <a id="sr" href="/sr/{{ path }}" hreflang="sr" lang="sr" role="menuitem" class="menu-bar-link">Serbian</a>
    <a id="es" href="/es/{{ path }}" hreflang="es" lang="es" role="menuitem" class="menu-bar-link">Spanish</a>
    <a id="sw" href="/sw/{{ path }}" hreflang="sw" lang="sw" role="menuitem" class="menu-bar-link">Swahili</a>
    <a id="tr" href="/tr/{{ path }}" hreflang="tr" lang="tr" role="menuitem" class="menu-bar-link">Turkish</a>
    <a id="uk" href="/uk/{{ path }}" hreflang="uk" lang="uk" role="menuitem" class="menu-bar-link">Ukrainian</a>
</div>
|
||||
</div>
|
||||
|
||||
|
||||
Reference in New Issue
Block a user