diff --git a/scripts/seo_postprocess.py b/scripts/seo_postprocess.py index 5ec45b107..a74343fe9 100644 --- a/scripts/seo_postprocess.py +++ b/scripts/seo_postprocess.py @@ -1,5 +1,6 @@ import argparse import html +import json import re from datetime import datetime, timezone from pathlib import Path @@ -71,6 +72,10 @@ def canonical_url(site_url, lang, rel_path): return f"{site_url.rstrip('/')}/{lang}/{rel_path.as_posix()}" +def asset_url(site_url, lang, asset_path): + return f"{site_url.rstrip('/')}/{lang}/{asset_path.lstrip('/')}" + + def clean_text(fragment): fragment = re.sub(r"]*>.*?", " ", fragment, flags=re.I | re.S) fragment = re.sub(r"]*>.*?", " ", fragment, flags=re.I | re.S) @@ -104,8 +109,100 @@ def extract_description(document, fallback): return trim_description(clean_text(scope), fallback) -def build_seo_block(site_url, lang, rel_path, languages, default_lang): +def strip_index_suffix(path): + return re.sub(r"(?:^|/)index\.html$", "", path.as_posix()) + + +def is_homepage(rel_path): + return rel_path.as_posix() == "index.html" + + +def humanize_slug(value): + value = value.replace(".html", "").replace("-", " ").replace("_", " ").strip() + value = re.sub(r"\s+", " ", value) + return value.title() if value else "Home" + + +def breadcrumb_items(site_url, lang, rel_path): + items = [{"name": "Home", "url": canonical_url(site_url, lang, Path("index.html"))}] + bare_path = strip_index_suffix(rel_path) + if not bare_path: + return items + + parts = [part for part in bare_path.split("/") if part] + for idx in range(len(parts)): + crumb_rel = Path(*parts[: idx + 1], "index.html") + items.append({"name": humanize_slug(parts[idx]), "url": canonical_url(site_url, lang, crumb_rel)}) + return items + + +def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url): current_url = canonical_url(site_url, lang, rel_path) + site_root = site_url.rstrip("/") + website_url = canonical_url(site_url, "en", Path("index.html")) + data = [ + { + "@context": "https://schema.org", + "@type": "Organization", + "@id": f"{site_root}/#organization", + "name": site_name, + "url": site_root, + "logo": {"@type": "ImageObject", "url": image_url}, + }, + { + "@context": "https://schema.org", + "@type": "WebSite", + "@id": f"{site_root}/#website", + "url": site_root, + "name": site_name, + "inLanguage": "en", + "publisher": {"@id": f"{site_root}/#organization"}, + }, + { + "@context": "https://schema.org", + "@type": "WebPage", + "@id": f"{current_url}#webpage", + "url": current_url, + "name": title, + "description": description, + "inLanguage": lang, + "isPartOf": {"@id": f"{site_root}/#website"}, + "about": {"@id": f"{site_root}/#organization"}, + "primaryImageOfPage": {"@type": "ImageObject", "url": image_url}, + }, + { + "@context": "https://schema.org", + "@type": "BreadcrumbList", + "itemListElement": [ + { + "@type": "ListItem", + "position": index, + "name": item["name"], + "item": item["url"], + } + for index, item in enumerate(breadcrumb_items(site_url, lang, rel_path), start=1) + ], + }, + ] + + if is_homepage(rel_path): + data[1]["potentialAction"] = { + "@type": "SearchAction", + "target": f"{website_url}?search={{search_term_string}}", + "query-input": "required name=search_term_string", + } + + return data + + +def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, description, site_name): + current_url = canonical_url(site_url, lang, rel_path) + image_url = asset_url(site_url, default_lang, "favicon.png") + structured_data = json.dumps( + build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url), + ensure_ascii=False, + separators=(",", ":"), + ) lines = [SEO_START, f''] for alt_lang in languages: @@ -116,6 +213,23 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang): default_url = canonical_url(site_url, default_lang, rel_path) lines.append(f'') + lines.extend( + [ + f'', + f'', + f'', + f'', + f'', + f'', + f'', + f'', + f'', + f'', + f'', + f'', + '", + ] + ) lines.append(SEO_END) return "\n ".join(lines) @@ -125,7 +239,9 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang, page_title = clean_text(title_match.group(1)) if title_match else site_name fallback_description = f"{site_name}: {page_title}" description = extract_description(document, fallback_description) - seo_block = build_seo_block(site_url, lang, rel_path, languages, default_lang) + seo_block = build_seo_block( + site_url, lang, rel_path, languages, default_lang, page_title, description, site_name + ) document = re.sub( r"\s*.*?\s*",