')
-
- Returns:
- Dictionary of attribute name-value pairs
-
- Examples:
- >>> extract_attributes('
')
- {'class': 'test', 'id': 'main'}
- """
- if not html_element:
- return {}
-
- # Use regex to extract attributes from HTML string
- attr_pattern = r'(\w+)=(["\'])([^"\']*?)\2'
- matches = re.findall(attr_pattern, html_element)
-
- attributes = {}
- for match in matches:
- attr_name, _, attr_value = match
- attributes[attr_name] = attr_value
-
- # Handle attributes without quotes
- unquoted_pattern = r"(\w+)=([^\s>]+)"
- unquoted_matches = re.findall(unquoted_pattern, html_element)
- for attr_name, attr_value in unquoted_matches:
- if attr_name not in attributes:
- attributes[attr_name] = attr_value
-
- return attributes
-
-
-def get_element_by_id(element_id: str, html_content: str) -> Optional[str]:
- """
- Get HTML element by ID.
-
- Args:
- element_id: The ID attribute value to search for
- html_content: HTML content to search in
-
- Returns:
- HTML string of the element or None if not found
-
- Examples:
- >>> html = '
Content
'
- >>> get_element_by_id("test", html)
- '
Content
'
- """
- parsed = _default_parser.parse(html_content)
-
- if _default_parser.config.use_lxml and HAS_LXML:
- try:
- element = parsed.xpath(f'//*[@id="{element_id}"]')
- if element:
- return etree.tostring(element[0], encoding="unicode", method="html")
- except Exception as e:
- logger.warning(f"lxml XPath search failed: {e}")
- return None
- else:
- element = parsed.find_by_id(element_id)
- if element:
- return _element_to_html(element, html_content)
-
- return None
-
-
-def get_element_by_tag(tag_name: str, html_content: str) -> Optional[str]:
- """
- Get first HTML element by tag name.
-
- Args:
- tag_name: The tag name to search for
- html_content: HTML content to search in
-
- Returns:
- HTML string of the element or None if not found
- """
- parsed = _default_parser.parse(html_content)
-
- if _default_parser.config.use_lxml and HAS_LXML:
- try:
- elements = parsed.xpath(f"//{tag_name}")
- if elements:
- return etree.tostring(elements[0], encoding="unicode", method="html")
- except Exception as e:
- logger.warning(f"lxml XPath search failed: {e}")
- return None
- else:
- elements = parsed.find_by_tag(tag_name)
- if elements:
- return _element_to_html(elements[0], html_content)
-
- return None
-
-
-def get_element_by_class(class_name: str, html_content: str) -> Optional[str]:
- """
- Get first HTML element by class name.
-
- Args:
- class_name: The class name to search for
- html_content: HTML content to search in
-
- Returns:
- HTML string of the element or None if not found
- """
- parsed = _default_parser.parse(html_content)
-
- if _default_parser.config.use_lxml and HAS_LXML:
- try:
- elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
- if elements:
- return etree.tostring(elements[0], encoding="unicode", method="html")
- except Exception as e:
- logger.warning(f"lxml XPath search failed: {e}")
- return None
- else:
- elements = parsed.find_by_class(class_name)
- if elements:
- return _element_to_html(elements[0], html_content)
-
- return None
-
-
-def get_elements_by_tag(tag_name: str, html_content: str) -> List[str]:
- """
- Get all HTML elements by tag name.
-
- Args:
- tag_name: The tag name to search for
- html_content: HTML content to search in
-
- Returns:
- List of HTML strings for matching elements
- """
- parsed = _default_parser.parse(html_content)
- results = []
-
- if _default_parser.config.use_lxml and HAS_LXML:
- try:
- elements = parsed.xpath(f"//{tag_name}")
- for element in elements:
- results.append(
- etree.tostring(element, encoding="unicode", method="html")
- )
- except Exception as e:
- logger.warning(f"lxml XPath search failed: {e}")
- else:
- elements = parsed.find_by_tag(tag_name)
- for element in elements:
- results.append(_element_to_html(element, html_content))
-
- return results
-
-
-def get_elements_by_class(class_name: str, html_content: str) -> List[str]:
- """
- Get all HTML elements by class name.
-
- Args:
- class_name: The class name to search for
- html_content: HTML content to search in
-
- Returns:
- List of HTML strings for matching elements
- """
- parsed = _default_parser.parse(html_content)
- results = []
-
- if _default_parser.config.use_lxml and HAS_LXML:
- try:
- elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
- for element in elements:
- results.append(
- etree.tostring(element, encoding="unicode", method="html")
- )
- except Exception as e:
- logger.warning(f"lxml XPath search failed: {e}")
- else:
- elements = parsed.find_by_class(class_name)
- for element in elements:
- results.append(_element_to_html(element, html_content))
-
- return results
-
-
-def get_elements_html_by_class(class_name: str, html_content: str) -> List[str]:
- """
- Get HTML strings of elements by class name.
-
- This is an alias for get_elements_by_class for yt-dlp compatibility.
-
- Args:
- class_name: The class name to search for
- html_content: HTML content to search in
-
- Returns:
- List of HTML strings for matching elements
- """
- return get_elements_by_class(class_name, html_content)
-
-
-def get_element_text_and_html_by_tag(
- tag_name: str, html_content: str
-) -> Tuple[Optional[str], Optional[str]]:
- """
- Get both text content and HTML of first element by tag name.
-
- Args:
- tag_name: The tag name to search for
- html_content: HTML content to search in
-
- Returns:
- Tuple of (text_content, html_string) or (None, None) if not found
-
- Examples:
- >>> html = ''
- >>> get_element_text_and_html_by_tag("script", html)
- ('alert("test");', '')
- """
- parsed = _default_parser.parse(html_content)
-
- if _default_parser.config.use_lxml and HAS_LXML:
- try:
- elements = parsed.xpath(f"//{tag_name}")
- if elements:
- element = elements[0]
- text = (
- element.text_content()
- if hasattr(element, "text_content")
- else (element.text or "")
- )
- html_str = etree.tostring(element, encoding="unicode", method="html")
- return text, html_str
- except Exception as e:
- logger.warning(f"lxml XPath search failed: {e}")
- return None, None
- else:
- elements = parsed.find_by_tag(tag_name)
- if elements:
- element = elements[0]
- text = _extract_text_content(element)
- html_str = _element_to_html(element, html_content)
- return text, html_str
-
- return None, None
-
-
-def _element_to_html(element: Dict, original_html: str) -> str:
- """
- Convert parsed element back to HTML string.
-
- This is a simplified implementation that reconstructs HTML from parsed data.
- For production use, consider using lxml for better accuracy.
- """
- if not element:
- return ""
-
- # Build opening tag
- tag = element["tag"]
- attrs = element.get("attrs", {})
- attr_str = " ".join(f'{k}="{v}"' for k, v in attrs.items() if v is not None)
-
- if attr_str:
- opening_tag = f"<{tag} {attr_str}>"
- else:
- opening_tag = f"<{tag}>"
-
- # Add text content
- text = element.get("text", "")
-
- # Add children
- children_html = ""
- for child in element.get("children", []):
- children_html += _element_to_html(child, original_html)
-
- # Build closing tag
- closing_tag = f"{tag}>"
-
- return f"{opening_tag}{text}{children_html}{closing_tag}"
-
-
-def _extract_text_content(element: Dict) -> str:
- """Extract all text content from element and its children."""
- text = element.get("text", "")
-
- for child in element.get("children", []):
- text += _extract_text_content(child)
-
- return text
-
-
-def configure_parser(use_lxml: Optional[bool] = None) -> None:
- """
- Configure the global HTML parser.
-
- Args:
- use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None)
- """
- global _default_parser
- _default_parser = HTMLParser(HTMLParserConfig(use_lxml))
- logger.info(
- f"HTML parser configured: {'lxml' if _default_parser.config.use_lxml else 'html.parser'}"
- )
diff --git a/viu_media/libs/provider/scraping/user_agents.py b/viu_media/libs/provider/scraping/user_agents.py
deleted file mode 100644
index 4637aae..0000000
--- a/viu_media/libs/provider/scraping/user_agents.py
+++ /dev/null
@@ -1,235 +0,0 @@
-"""
-User agent utilities for web scraping.
-
-Provides functionality to generate random user agent strings
-to avoid detection and blocking by websites.
-"""
-
-import random
-from typing import List, Optional
-
-
-class UserAgentGenerator:
- """
- Generator for realistic user agent strings.
-
- Provides a variety of common user agents from different browsers
- and operating systems to help avoid detection.
- """
-
- # Common user agents for different browsers and OS combinations
- USER_AGENTS = [
- # Chrome on Windows
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
- # Chrome on macOS
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
- # Chrome on Linux
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
- # Firefox on Windows
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
- # Firefox on macOS
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0",
- # Firefox on Linux
- "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
- "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
- # Safari on macOS
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
- # Edge on Windows
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
- # Mobile Chrome (Android)
- "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
- # Mobile Safari (iOS)
- "Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
- "Mozilla/5.0 (iPad; CPU OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
- ]
-
- # Browser-specific user agents for when you need a specific browser
- CHROME_USER_AGENTS = [
- ua for ua in USER_AGENTS if "Chrome" in ua and "Edg" not in ua
- ]
- FIREFOX_USER_AGENTS = [ua for ua in USER_AGENTS if "Firefox" in ua]
- SAFARI_USER_AGENTS = [
- ua for ua in USER_AGENTS if "Safari" in ua and "Chrome" not in ua
- ]
- EDGE_USER_AGENTS = [ua for ua in USER_AGENTS if "Edg" in ua]
-
- # Platform-specific user agents
- WINDOWS_USER_AGENTS = [ua for ua in USER_AGENTS if "Windows NT" in ua]
- MACOS_USER_AGENTS = [ua for ua in USER_AGENTS if "Macintosh" in ua]
- LINUX_USER_AGENTS = [
- ua for ua in USER_AGENTS if "Linux" in ua and "Android" not in ua
- ]
- MOBILE_USER_AGENTS = [ua for ua in USER_AGENTS if "Mobile" in ua or "Android" in ua]
-
- def __init__(self, seed: Optional[int] = None):
- """
- Initialize the user agent generator.
-
- Args:
- seed: Random seed for reproducible results (optional)
- """
- if seed is not None:
- random.seed(seed)
-
- def random(self) -> str:
- """
- Get a random user agent string.
-
- Returns:
- Random user agent string
- """
- return random.choice(self.USER_AGENTS)
-
- def random_browser(self, browser: str) -> str:
- """
- Get a random user agent for a specific browser.
-
- Args:
- browser: Browser name ('chrome', 'firefox', 'safari', 'edge')
-
- Returns:
- Random user agent string for the specified browser
-
- Raises:
- ValueError: If browser is not supported
- """
- browser = browser.lower()
- if browser == "chrome":
- return random.choice(self.CHROME_USER_AGENTS)
- elif browser == "firefox":
- return random.choice(self.FIREFOX_USER_AGENTS)
- elif browser == "safari":
- return random.choice(self.SAFARI_USER_AGENTS)
- elif browser == "edge":
- return random.choice(self.EDGE_USER_AGENTS)
- else:
- raise ValueError(f"Unsupported browser: {browser}")
-
- def random_platform(self, platform: str) -> str:
- """
- Get a random user agent for a specific platform.
-
- Args:
- platform: Platform name ('windows', 'macos', 'linux', 'mobile')
-
- Returns:
- Random user agent string for the specified platform
-
- Raises:
- ValueError: If platform is not supported
- """
- platform = platform.lower()
- if platform == "windows":
- return random.choice(self.WINDOWS_USER_AGENTS)
- elif platform in ("macos", "mac"):
- return random.choice(self.MACOS_USER_AGENTS)
- elif platform == "linux":
- return random.choice(self.LINUX_USER_AGENTS)
- elif platform == "mobile":
- return random.choice(self.MOBILE_USER_AGENTS)
- else:
- raise ValueError(f"Unsupported platform: {platform}")
-
- def add_user_agent(self, user_agent: str) -> None:
- """
- Add a custom user agent to the list.
-
- Args:
- user_agent: Custom user agent string to add
- """
- if user_agent not in self.USER_AGENTS:
- self.USER_AGENTS.append(user_agent)
-
- def get_all(self) -> List[str]:
- """
- Get all available user agent strings.
-
- Returns:
- List of all user agent strings
- """
- return self.USER_AGENTS.copy()
-
-
-# Global instance for convenience
-_default_generator = UserAgentGenerator()
-
-
-def random_user_agent() -> str:
- """
- Get a random user agent string using the default generator.
-
- Returns:
- Random user agent string
-
- Examples:
- >>> ua = random_user_agent()
- >>> "Mozilla" in ua
- True
- """
- return _default_generator.random()
-
-
-def random_user_agent_browser(browser: str) -> str:
- """
- Get a random user agent for a specific browser.
-
- Args:
- browser: Browser name ('chrome', 'firefox', 'safari', 'edge')
-
- Returns:
- Random user agent string for the specified browser
- """
- return _default_generator.random_browser(browser)
-
-
-def random_user_agent_platform(platform: str) -> str:
- """
- Get a random user agent for a specific platform.
-
- Args:
- platform: Platform name ('windows', 'macos', 'linux', 'mobile')
-
- Returns:
- Random user agent string for the specified platform
- """
- return _default_generator.random_platform(platform)
-
-
-def set_user_agent_seed(seed: int) -> None:
- """
- Set the random seed for user agent generation.
-
- Args:
- seed: Random seed value
- """
- global _default_generator
- _default_generator = UserAgentGenerator(seed)
-
-
-def add_custom_user_agent(user_agent: str) -> None:
- """
- Add a custom user agent to the default generator.
-
- Args:
- user_agent: Custom user agent string to add
- """
- _default_generator.add_user_agent(user_agent)
-
-
-def get_all_user_agents() -> List[str]:
- """
- Get all available user agent strings from the default generator.
-
- Returns:
- List of all user agent strings
- """
- return _default_generator.get_all()
diff --git a/viu_media/libs/provider/scraping/utils.py b/viu_media/libs/provider/scraping/utils.py
deleted file mode 100644
index b241bcd..0000000
--- a/viu_media/libs/provider/scraping/utils.py
+++ /dev/null
@@ -1,272 +0,0 @@
-"""
-Encoding and utility functions for web scraping.
-
-Provides various encoding utilities including base-N encoding
-that was previously sourced from yt-dlp.
-"""
-
-import string
-from typing import Optional
-
-
-def encode_base_n(num: int, n: int, table: Optional[str] = None) -> str:
- """
- Encode a number in base-n representation.
-
- Args:
- num: The number to encode
- n: The base to use for encoding
- table: Custom character table (optional)
-
- Returns:
- String representation of the number in base-n
-
- Examples:
- >>> encode_base_n(255, 16)
- 'ff'
- >>> encode_base_n(42, 36)
- '16'
- """
- if table is None:
- # Default table: 0-9, a-z
- table = string.digits + string.ascii_lowercase
-
- if not 2 <= n <= len(table):
- raise ValueError(f"Base must be between 2 and {len(table)}")
-
- if num == 0:
- return table[0]
-
- result = []
- is_negative = num < 0
- num = abs(num)
-
- while num > 0:
- result.append(table[num % n])
- num //= n
-
- if is_negative:
- result.append("-")
-
- return "".join(reversed(result))
-
-
-def decode_base_n(encoded: str, n: int, table: Optional[str] = None) -> int:
- """
- Decode a base-n encoded string back to an integer.
-
- Args:
- encoded: The base-n encoded string
- n: The base used for encoding
- table: Custom character table (optional)
-
- Returns:
- The decoded integer
-
- Examples:
- >>> decode_base_n('ff', 16)
- 255
- >>> decode_base_n('16', 36)
- 42
- """
- if table is None:
- table = string.digits + string.ascii_lowercase
-
- if not 2 <= n <= len(table):
- raise ValueError(f"Base must be between 2 and {len(table)}")
-
- if not encoded:
- return 0
-
- is_negative = encoded.startswith("-")
- if is_negative:
- encoded = encoded[1:]
-
- result = 0
- for i, char in enumerate(reversed(encoded.lower())):
- if char not in table:
- raise ValueError(f"Invalid character '{char}' for base {n}")
-
- digit_value = table.index(char)
- if digit_value >= n:
- raise ValueError(f"Invalid digit '{char}' for base {n}")
-
- result += digit_value * (n**i)
-
- return -result if is_negative else result
-
-
-def url_encode(text: str, safe: str = "") -> str:
- """
- URL encode a string.
-
- Args:
- text: Text to encode
- safe: Characters that should not be encoded
-
- Returns:
- URL encoded string
- """
- import urllib.parse
-
- return urllib.parse.quote(text, safe=safe)
-
-
-def url_decode(text: str) -> str:
- """
- URL decode a string.
-
- Args:
- text: URL encoded text to decode
-
- Returns:
- Decoded string
- """
- import urllib.parse
-
- return urllib.parse.unquote(text)
-
-
-def html_unescape(text: str) -> str:
- """
- Unescape HTML entities in text.
-
- Args:
- text: Text containing HTML entities
-
- Returns:
- Text with HTML entities unescaped
-
- Examples:
- >>> html_unescape('"Hello" & <World>')
- '"Hello" &
'
- """
- import html
-
- return html.unescape(text)
-
-
-def strip_tags(html_content: str) -> str:
- """
- Remove all HTML tags from content, leaving only text.
-
- Args:
- html_content: HTML content with tags
-
- Returns:
- Plain text with tags removed
-
- Examples:
- >>> strip_tags('Hello world!
')
- 'Hello world!'
- """
- import re
-
- return re.sub(r"<[^>]+>", "", html_content)
-
-
-def normalize_whitespace(text: str) -> str:
- """
- Normalize whitespace in text by collapsing multiple spaces and removing leading/trailing whitespace.
-
- Args:
- text: Text to normalize
-
- Returns:
- Text with normalized whitespace
-
- Examples:
- >>> normalize_whitespace(' Hello world \\n\\t ')
- 'Hello world'
- """
- import re
-
- return re.sub(r"\s+", " ", text.strip())
-
-
-def extract_domain(url: str) -> str:
- """
- Extract domain from a URL.
-
- Args:
- url: Full URL
-
- Returns:
- Domain portion of the URL
-
- Examples:
- >>> extract_domain('https://example.com/path?query=1')
- 'example.com'
- """
- import urllib.parse
-
- parsed = urllib.parse.urlparse(url)
- return parsed.netloc
-
-
-def join_url(base: str, path: str) -> str:
- """
- Join a base URL with a path.
-
- Args:
- base: Base URL
- path: Path to join
-
- Returns:
- Combined URL
-
- Examples:
- >>> join_url('https://example.com', '/api/data')
- 'https://example.com/api/data'
- """
- import urllib.parse
-
- return urllib.parse.urljoin(base, path)
-
-
-def parse_query_string(query: str) -> dict:
- """
- Parse a query string into a dictionary.
-
- Args:
- query: Query string (with or without leading '?')
-
- Returns:
- Dictionary of query parameters
-
- Examples:
- >>> parse_query_string('?name=John&age=30')
- {'name': ['John'], 'age': ['30']}
- """
- import urllib.parse
-
- if query.startswith("?"):
- query = query[1:]
- return urllib.parse.parse_qs(query)
-
-
-def build_query_string(params: dict) -> str:
- """
- Build a query string from a dictionary of parameters.
-
- Args:
- params: Dictionary of parameters
-
- Returns:
- URL-encoded query string
-
- Examples:
- >>> build_query_string({'name': 'John', 'age': 30})
- 'name=John&age=30'
- """
- import urllib.parse
-
- # Handle both single values and lists
- normalized_params = {}
- for key, value in params.items():
- if isinstance(value, (list, tuple)):
- normalized_params[key] = value
- else:
- normalized_params[key] = [str(value)]
-
- return urllib.parse.urlencode(normalized_params, doseq=True)