""" HTML parsing utilities with optional lxml support. This module provides comprehensive HTML parsing capabilities using either Python's built-in html.parser or lxml for better performance when available. """ # TODO: Review and optimize the HTML parsing logic for better performance and flexibility. # Consider adding more utility functions for common HTML manipulation tasks. import logging import re from html.parser import HTMLParser as BaseHTMLParser from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: from lxml import etree logger = logging.getLogger(__name__) # Try to import lxml HAS_LXML = False try: from lxml import etree, html as lxml_html HAS_LXML = True logger.debug("lxml is available and will be used for HTML parsing") except ImportError: logger.debug("lxml not available, falling back to html.parser") class HTMLParserConfig: """Configuration for HTML parser selection.""" def __init__(self, use_lxml: Optional[bool] = None): """ Initialize parser configuration. Args: use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None) """ if use_lxml is None: self.use_lxml = HAS_LXML else: self.use_lxml = use_lxml and HAS_LXML if use_lxml and not HAS_LXML: logger.warning( "lxml requested but not available, falling back to html.parser" ) class HTMLParser: """ Comprehensive HTML parser with optional lxml support. Provides a unified interface for HTML parsing operations regardless of the underlying parser implementation. """ def __init__(self, config: Optional[HTMLParserConfig] = None): """Initialize the HTML parser with configuration.""" self.config = config or HTMLParserConfig() def parse(self, html_content: str) -> Union[Any, "ParsedHTML"]: """ Parse HTML content and return a parsed tree. Args: html_content: Raw HTML string to parse Returns: Parsed HTML tree (lxml Element or custom ParsedHTML object) """ if self.config.use_lxml: return self._parse_with_lxml(html_content) else: return self._parse_with_builtin(html_content) def _parse_with_lxml(self, html_content: str) -> Any: """Parse HTML using lxml.""" try: # Use lxml's HTML parser which is more lenient return lxml_html.fromstring(html_content) except Exception as e: logger.warning(f"lxml parsing failed: {e}, falling back to html.parser") return self._parse_with_builtin(html_content) def _parse_with_builtin(self, html_content: str) -> "ParsedHTML": """Parse HTML using Python's built-in parser.""" parser = BuiltinHTMLParser() parser.feed(html_content) return ParsedHTML(parser.elements, html_content) class BuiltinHTMLParser(BaseHTMLParser): """Enhanced HTML parser using Python's built-in capabilities.""" def __init__(self): super().__init__() self.elements = [] self.current_element = None self.element_stack = [] def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): """Handle opening tags.""" element = { "tag": tag, "attrs": dict(attrs), "text": "", "children": [], "start_pos": self.getpos(), } if self.element_stack: self.element_stack[-1]["children"].append(element) else: self.elements.append(element) self.element_stack.append(element) def handle_endtag(self, tag: str): """Handle closing tags.""" if self.element_stack and self.element_stack[-1]["tag"] == tag: element = self.element_stack.pop() element["end_pos"] = self.getpos() def handle_data(self, data: str): """Handle text content.""" if self.element_stack: self.element_stack[-1]["text"] += data class ParsedHTML: """Wrapper for parsed HTML using built-in parser.""" def __init__(self, elements: List[Dict], 


class BuiltinHTMLParser(BaseHTMLParser):
    """Enhanced HTML parser using Python's built-in capabilities."""

    def __init__(self):
        super().__init__()
        self.elements = []
        self.current_element = None
        self.element_stack = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Handle opening tags."""
        element = {
            "tag": tag,
            "attrs": dict(attrs),
            "text": "",
            "children": [],
            "start_pos": self.getpos(),
        }
        if self.element_stack:
            self.element_stack[-1]["children"].append(element)
        else:
            self.elements.append(element)
        self.element_stack.append(element)

    def handle_endtag(self, tag: str):
        """Handle closing tags."""
        # Only pop when the closing tag matches the innermost open element;
        # malformed HTML may leave elements without an end_pos.
        if self.element_stack and self.element_stack[-1]["tag"] == tag:
            element = self.element_stack.pop()
            element["end_pos"] = self.getpos()

    def handle_data(self, data: str):
        """Handle text content."""
        if self.element_stack:
            self.element_stack[-1]["text"] += data


class ParsedHTML:
    """Wrapper for parsed HTML using built-in parser."""

    def __init__(self, elements: List[Dict], raw_html: str):
        self.elements = elements
        self.raw_html = raw_html

    def find_by_id(self, element_id: str) -> Optional[Dict]:
        """Find element by ID."""
        return self._find_recursive(
            self.elements, lambda el: el["attrs"].get("id") == element_id
        )

    def find_by_class(self, class_name: str) -> List[Dict]:
        """Find elements by class name."""
        results = []
        self._find_all_recursive(
            self.elements,
            # Attribute values can be None for valueless attributes, so guard
            # before splitting the class list.
            lambda el: class_name in (el["attrs"].get("class") or "").split(),
            results,
        )
        return results

    def find_by_tag(self, tag_name: str) -> List[Dict]:
        """Find elements by tag name."""
        results = []
        self._find_all_recursive(
            self.elements, lambda el: el["tag"].lower() == tag_name.lower(), results
        )
        return results

    def _find_recursive(self, elements: List[Dict], condition) -> Optional[Dict]:
        """Recursively find first element matching condition."""
        for element in elements:
            if condition(element):
                return element
            result = self._find_recursive(element["children"], condition)
            if result:
                return result
        return None

    def _find_all_recursive(self, elements: List[Dict], condition, results: List[Dict]):
        """Recursively find all elements matching condition."""
        for element in elements:
            if condition(element):
                results.append(element)
            self._find_all_recursive(element["children"], condition, results)


# Global parser instance
_default_parser = HTMLParser()


def extract_attributes(html_element: str) -> Dict[str, str]:
    """
    Extract attributes from an HTML element string.

    Args:
        html_element: HTML element as string (e.g., '