mirror of
https://github.com/Benexl/FastAnime.git
synced 2025-12-05 20:40:09 -08:00
feat: refactor provider imports and enhance HTML parsing utilities
@@ -3,4 +3,4 @@ provider_type=$1
 provider_name=$2
 [ -z "$provider_type" ] && echo "Please specify provider type" && exit
 [ -z "$provider_name" ] && echo "Please specify provider type" && exit
-uv run python -m fastanime.libs.providers.${provider_type}.${provider_name}.provider
+uv run python -m fastanime.libs.provider.${provider_type}.${provider_name}.provider
@@ -2,7 +2,7 @@ import re
 
 
 def animepahe_key_creator(c: int, a: int):
-    from yt_dlp.utils import encode_base_n
+    from ...scraping.utils import encode_base_n
 
     if c < a:
         val_a = ""
@@ -37,17 +37,18 @@ ENCODE_JS_REGEX = re.compile(r"'(.*?);',(\d+),(\d+),'(.*)'\.split")
 
 
 def process_animepahe_embed_page(embed_page: str):
-    from yt_dlp.utils import get_element_text_and_html_by_tag
+    from ...scraping.html_parser import get_element_text_and_html_by_tag
 
     encoded_js_string = ""
     embed_page_content = embed_page
     for _ in range(8):
         text, html = get_element_text_and_html_by_tag("script", embed_page_content)
-        if not text:
+        if not text and html:
             embed_page_content = re.sub(html, "", embed_page_content)
             continue
-        encoded_js_string = text.strip()
-        break
+        if text:
+            encoded_js_string = text.strip()
+            break
     if not encoded_js_string:
         return
     obsfucated_js_parameter_match = PARAMETERS_REGEX.search(encoded_js_string)
@@ -106,8 +106,7 @@ class AnimePahe(BaseAnimeProvider):
 
     @debug_provider
    def episode_streams(self, params: EpisodeStreamsParams) -> Iterator[Server] | None:
-        # TODO: replace with custom implementations using default html parser or lxml
-        from yt_dlp.utils import (
+        from ...scraping.html_parser import (
             extract_attributes,
             get_element_by_id,
             get_elements_html_by_class,
@@ -125,6 +124,9 @@ class AnimePahe(BaseAnimeProvider):
         response.raise_for_status()
 
         c = get_element_by_id("resolutionMenu", response.text)
+        if not c:
+            logger.error("Resolution menu not found in the response")
+            return
         resolutionMenuItems = get_elements_html_by_class("dropdown-item", c)
         res_dicts = [extract_attributes(item) for item in resolutionMenuItems]
         quality = None
@@ -133,8 +135,9 @@ class AnimePahe(BaseAnimeProvider):
 
         # TODO: better document the scraping process
         for res_dict in res_dicts:
-            embed_url = res_dict["data-src"]
-            data_audio = "dub" if res_dict["data-audio"] == "eng" else "sub"
+            # the actual attributes are data attributes in the original html 'prefixed with data-'
+            embed_url = res_dict["src"]
+            data_audio = "dub" if res_dict["audio"] == "eng" else "sub"
 
             if data_audio != params.translation_type:
                 continue
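Why the lookups above drop the `data-` prefix: `extract_attributes` in the new html_parser.py added below captures only the word characters immediately before the `=` sign, so `data-src` is stored under the key `src`. A minimal illustration (the sample element and URL are made up):

    from fastanime.libs.provider.scraping.html_parser import extract_attributes

    item = '<a class="dropdown-item" data-src="https://example.invalid/e/abc" data-audio="eng" data-resolution="1080">1080p</a>'
    attrs = extract_attributes(item)
    print(attrs["src"])         # https://example.invalid/e/abc
    print(attrs["audio"])       # eng
    print(attrs["resolution"])  # 1080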
@@ -162,7 +165,7 @@ class AnimePahe(BaseAnimeProvider):
                 logger.error("failed to find juicy stream")
                 continue
             juicy_stream = juicy_stream.group(1)
-            quality = res_dict["data-resolution"]
+            quality = res_dict["resolution"]
             translation_type = data_audio
             stream_link = juicy_stream
 
@@ -36,7 +36,7 @@ def test_anime_provider(AnimeProvider: Type[BaseAnimeProvider]):
|
||||
anime_provider = AnimeProvider(
|
||||
Client(headers={"User-Agent": random_user_agent(), **AnimeProvider.HEADERS})
|
||||
)
|
||||
print(APP_ASCII_ART)
|
||||
print(APP_ASCII_ART.read_text(encoding="utf-8"))
|
||||
query = input("What anime would you like to stream: ")
|
||||
search_results = anime_provider.search(SearchParams(query=query))
|
||||
if not search_results:
|
||||
|
||||
0	fastanime/libs/provider/scraping/__init__.py	Normal file
474	fastanime/libs/provider/scraping/html_parser.py	Normal file
@@ -0,0 +1,474 @@
"""
HTML parsing utilities with optional lxml support.

This module provides comprehensive HTML parsing capabilities using either
Python's built-in html.parser or lxml for better performance when available.
"""

# TODO: Review and optimize the HTML parsing logic for better performance and flexibility.
# Consider adding more utility functions for common HTML manipulation tasks.
import logging
import re
from html.parser import HTMLParser as BaseHTMLParser
from typing import Dict, List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

# Try to import lxml
HAS_LXML = False
try:
    from lxml import etree, html as lxml_html

    HAS_LXML = True
    logger.debug("lxml is available and will be used for HTML parsing")
except ImportError:
    logger.debug("lxml not available, falling back to html.parser")
class HTMLParserConfig:
    """Configuration for HTML parser selection."""

    def __init__(self, use_lxml: Optional[bool] = None):
        """
        Initialize parser configuration.

        Args:
            use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None)
        """
        if use_lxml is None:
            self.use_lxml = HAS_LXML
        else:
            self.use_lxml = use_lxml and HAS_LXML

        if use_lxml and not HAS_LXML:
            logger.warning("lxml requested but not available, falling back to html.parser")

class HTMLParser:
    """
    Comprehensive HTML parser with optional lxml support.

    Provides a unified interface for HTML parsing operations regardless
    of the underlying parser implementation.
    """

    def __init__(self, config: Optional[HTMLParserConfig] = None):
        """Initialize the HTML parser with configuration."""
        self.config = config or HTMLParserConfig()

    def parse(self, html_content: str) -> Union[etree._Element, 'ParsedHTML']:
        """
        Parse HTML content and return a parsed tree.

        Args:
            html_content: Raw HTML string to parse

        Returns:
            Parsed HTML tree (lxml Element or custom ParsedHTML object)
        """
        if self.config.use_lxml:
            return self._parse_with_lxml(html_content)
        else:
            return self._parse_with_builtin(html_content)

    def _parse_with_lxml(self, html_content: str) -> etree._Element:
        """Parse HTML using lxml."""
        try:
            # Use lxml's HTML parser which is more lenient
            return lxml_html.fromstring(html_content)
        except Exception as e:
            logger.warning(f"lxml parsing failed: {e}, falling back to html.parser")
            return self._parse_with_builtin(html_content)

    def _parse_with_builtin(self, html_content: str) -> 'ParsedHTML':
        """Parse HTML using Python's built-in parser."""
        parser = BuiltinHTMLParser()
        parser.feed(html_content)
        return ParsedHTML(parser.elements, html_content)

class BuiltinHTMLParser(BaseHTMLParser):
    """Enhanced HTML parser using Python's built-in capabilities."""

    def __init__(self):
        super().__init__()
        self.elements = []
        self.current_element = None
        self.element_stack = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Handle opening tags."""
        element = {
            'tag': tag,
            'attrs': dict(attrs),
            'text': '',
            'children': [],
            'start_pos': self.getpos(),
        }

        if self.element_stack:
            self.element_stack[-1]['children'].append(element)
        else:
            self.elements.append(element)

        self.element_stack.append(element)

    def handle_endtag(self, tag: str):
        """Handle closing tags."""
        if self.element_stack and self.element_stack[-1]['tag'] == tag:
            element = self.element_stack.pop()
            element['end_pos'] = self.getpos()

    def handle_data(self, data: str):
        """Handle text content."""
        if self.element_stack:
            self.element_stack[-1]['text'] += data

class ParsedHTML:
    """Wrapper for parsed HTML using built-in parser."""

    def __init__(self, elements: List[Dict], raw_html: str):
        self.elements = elements
        self.raw_html = raw_html

    def find_by_id(self, element_id: str) -> Optional[Dict]:
        """Find element by ID."""
        return self._find_recursive(self.elements, lambda el: el['attrs'].get('id') == element_id)

    def find_by_class(self, class_name: str) -> List[Dict]:
        """Find elements by class name."""
        results = []
        self._find_all_recursive(
            self.elements,
            lambda el: class_name in el['attrs'].get('class', '').split(),
            results
        )
        return results

    def find_by_tag(self, tag_name: str) -> List[Dict]:
        """Find elements by tag name."""
        results = []
        self._find_all_recursive(
            self.elements,
            lambda el: el['tag'].lower() == tag_name.lower(),
            results
        )
        return results

    def _find_recursive(self, elements: List[Dict], condition) -> Optional[Dict]:
        """Recursively find first element matching condition."""
        for element in elements:
            if condition(element):
                return element
            result = self._find_recursive(element['children'], condition)
            if result:
                return result
        return None

    def _find_all_recursive(self, elements: List[Dict], condition, results: List[Dict]):
        """Recursively find all elements matching condition."""
        for element in elements:
            if condition(element):
                results.append(element)
            self._find_all_recursive(element['children'], condition, results)

# Global parser instance
_default_parser = HTMLParser()


def extract_attributes(html_element: str) -> Dict[str, str]:
    """
    Extract attributes from an HTML element string.

    Args:
        html_element: HTML element as string (e.g., '<div class="test" id="main">')

    Returns:
        Dictionary of attribute name-value pairs

    Examples:
        >>> extract_attributes('<div class="test" id="main">')
        {'class': 'test', 'id': 'main'}
    """
    if not html_element:
        return {}

    # Use regex to extract attributes from HTML string
    attr_pattern = r'(\w+)=(["\'])([^"\']*?)\2'
    matches = re.findall(attr_pattern, html_element)

    attributes = {}
    for match in matches:
        attr_name, _, attr_value = match
        attributes[attr_name] = attr_value

    # Handle attributes without quotes
    unquoted_pattern = r'(\w+)=([^\s>]+)'
    unquoted_matches = re.findall(unquoted_pattern, html_element)
    for attr_name, attr_value in unquoted_matches:
        if attr_name not in attributes:
            attributes[attr_name] = attr_value

    return attributes

def get_element_by_id(element_id: str, html_content: str) -> Optional[str]:
    """
    Get HTML element by ID.

    Args:
        element_id: The ID attribute value to search for
        html_content: HTML content to search in

    Returns:
        HTML string of the element or None if not found

    Examples:
        >>> html = '<div id="test">Content</div>'
        >>> get_element_by_id("test", html)
        '<div id="test">Content</div>'
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            element = parsed.xpath(f'//*[@id="{element_id}"]')
            if element:
                return etree.tostring(element[0], encoding='unicode', method='html')
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None
    else:
        element = parsed.find_by_id(element_id)
        if element:
            return _element_to_html(element, html_content)

    return None


def get_element_by_tag(tag_name: str, html_content: str) -> Optional[str]:
    """
    Get first HTML element by tag name.

    Args:
        tag_name: The tag name to search for
        html_content: HTML content to search in

    Returns:
        HTML string of the element or None if not found
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//{tag_name}')
            if elements:
                return etree.tostring(elements[0], encoding='unicode', method='html')
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None
    else:
        elements = parsed.find_by_tag(tag_name)
        if elements:
            return _element_to_html(elements[0], html_content)

    return None


def get_element_by_class(class_name: str, html_content: str) -> Optional[str]:
    """
    Get first HTML element by class name.

    Args:
        class_name: The class name to search for
        html_content: HTML content to search in

    Returns:
        HTML string of the element or None if not found
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
            if elements:
                return etree.tostring(elements[0], encoding='unicode', method='html')
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None
    else:
        elements = parsed.find_by_class(class_name)
        if elements:
            return _element_to_html(elements[0], html_content)

    return None


def get_elements_by_tag(tag_name: str, html_content: str) -> List[str]:
    """
    Get all HTML elements by tag name.

    Args:
        tag_name: The tag name to search for
        html_content: HTML content to search in

    Returns:
        List of HTML strings for matching elements
    """
    parsed = _default_parser.parse(html_content)
    results = []

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//{tag_name}')
            for element in elements:
                results.append(etree.tostring(element, encoding='unicode', method='html'))
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
    else:
        elements = parsed.find_by_tag(tag_name)
        for element in elements:
            results.append(_element_to_html(element, html_content))

    return results


def get_elements_by_class(class_name: str, html_content: str) -> List[str]:
    """
    Get all HTML elements by class name.

    Args:
        class_name: The class name to search for
        html_content: HTML content to search in

    Returns:
        List of HTML strings for matching elements
    """
    parsed = _default_parser.parse(html_content)
    results = []

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
            for element in elements:
                results.append(etree.tostring(element, encoding='unicode', method='html'))
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
    else:
        elements = parsed.find_by_class(class_name)
        for element in elements:
            results.append(_element_to_html(element, html_content))

    return results

def get_elements_html_by_class(class_name: str, html_content: str) -> List[str]:
    """
    Get HTML strings of elements by class name.

    This is an alias for get_elements_by_class for yt-dlp compatibility.

    Args:
        class_name: The class name to search for
        html_content: HTML content to search in

    Returns:
        List of HTML strings for matching elements
    """
    return get_elements_by_class(class_name, html_content)


def get_element_text_and_html_by_tag(tag_name: str, html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Get both text content and HTML of first element by tag name.

    Args:
        tag_name: The tag name to search for
        html_content: HTML content to search in

    Returns:
        Tuple of (text_content, html_string) or (None, None) if not found

    Examples:
        >>> html = '<script>alert("test");</script>'
        >>> get_element_text_and_html_by_tag("script", html)
        ('alert("test");', '<script>alert("test");</script>')
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//{tag_name}')
            if elements:
                element = elements[0]
                text = element.text_content() if hasattr(element, 'text_content') else (element.text or '')
                html_str = etree.tostring(element, encoding='unicode', method='html')
                return text, html_str
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None, None
    else:
        elements = parsed.find_by_tag(tag_name)
        if elements:
            element = elements[0]
            text = _extract_text_content(element)
            html_str = _element_to_html(element, html_content)
            return text, html_str

    return None, None

def _element_to_html(element: Dict, original_html: str) -> str:
    """
    Convert parsed element back to HTML string.

    This is a simplified implementation that reconstructs HTML from parsed data.
    For production use, consider using lxml for better accuracy.
    """
    if not element:
        return ""

    # Build opening tag
    tag = element['tag']
    attrs = element.get('attrs', {})
    attr_str = ' '.join(f'{k}="{v}"' for k, v in attrs.items() if v is not None)

    if attr_str:
        opening_tag = f"<{tag} {attr_str}>"
    else:
        opening_tag = f"<{tag}>"

    # Add text content
    text = element.get('text', '')

    # Add children
    children_html = ""
    for child in element.get('children', []):
        children_html += _element_to_html(child, original_html)

    # Build closing tag
    closing_tag = f"</{tag}>"

    return f"{opening_tag}{text}{children_html}{closing_tag}"


def _extract_text_content(element: Dict) -> str:
    """Extract all text content from element and its children."""
    text = element.get('text', '')

    for child in element.get('children', []):
        text += _extract_text_content(child)

    return text


def configure_parser(use_lxml: Optional[bool] = None) -> None:
    """
    Configure the global HTML parser.

    Args:
        use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None)
    """
    global _default_parser
    _default_parser = HTMLParser(HTMLParserConfig(use_lxml))
    logger.info(f"HTML parser configured: {'lxml' if _default_parser.config.use_lxml else 'html.parser'}")
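For context, this is roughly how the yt-dlp-compatible surface of the new module is exercised by the AnimePahe provider; a minimal sketch with made-up HTML, using configure_parser(use_lxml=False) only to force the stdlib fallback:

    from fastanime.libs.provider.scraping.html_parser import (
        configure_parser,
        get_element_by_id,
        get_element_text_and_html_by_tag,
        get_elements_html_by_class,
    )

    configure_parser(use_lxml=False)  # force the html.parser backend

    page = '<div id="resolutionMenu"><a class="dropdown-item" data-src="u1">720p</a></div>'
    menu = get_element_by_id("resolutionMenu", page)
    items = get_elements_html_by_class("dropdown-item", menu)
    text, html = get_element_text_and_html_by_tag("a", items[0])
    print(text)  # 720p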
238	fastanime/libs/provider/scraping/user_agents.py	Normal file
@@ -0,0 +1,238 @@
"""
User agent utilities for web scraping.

Provides functionality to generate random user agent strings
to avoid detection and blocking by websites.
"""

import random
from typing import List, Optional


class UserAgentGenerator:
    """
    Generator for realistic user agent strings.

    Provides a variety of common user agents from different browsers
    and operating systems to help avoid detection.
    """

    # Common user agents for different browsers and OS combinations
    USER_AGENTS = [
        # Chrome on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        # Chrome on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        # Chrome on Linux
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        # Firefox on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
        # Firefox on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0",
        # Firefox on Linux
        "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
        # Safari on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        # Edge on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
        # Mobile Chrome (Android)
        "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
        # Mobile Safari (iOS)
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
    ]

    # Browser-specific user agents for when you need a specific browser
    CHROME_USER_AGENTS = [ua for ua in USER_AGENTS if "Chrome" in ua and "Edg" not in ua]
    FIREFOX_USER_AGENTS = [ua for ua in USER_AGENTS if "Firefox" in ua]
    SAFARI_USER_AGENTS = [ua for ua in USER_AGENTS if "Safari" in ua and "Chrome" not in ua]
    EDGE_USER_AGENTS = [ua for ua in USER_AGENTS if "Edg" in ua]

    # Platform-specific user agents
    WINDOWS_USER_AGENTS = [ua for ua in USER_AGENTS if "Windows NT" in ua]
    MACOS_USER_AGENTS = [ua for ua in USER_AGENTS if "Macintosh" in ua]
    LINUX_USER_AGENTS = [ua for ua in USER_AGENTS if "Linux" in ua and "Android" not in ua]
    MOBILE_USER_AGENTS = [ua for ua in USER_AGENTS if "Mobile" in ua or "Android" in ua]

    def __init__(self, seed: Optional[int] = None):
        """
        Initialize the user agent generator.

        Args:
            seed: Random seed for reproducible results (optional)
        """
        if seed is not None:
            random.seed(seed)

    def random(self) -> str:
        """
        Get a random user agent string.

        Returns:
            Random user agent string
        """
        return random.choice(self.USER_AGENTS)

    def random_browser(self, browser: str) -> str:
        """
        Get a random user agent for a specific browser.

        Args:
            browser: Browser name ('chrome', 'firefox', 'safari', 'edge')

        Returns:
            Random user agent string for the specified browser

        Raises:
            ValueError: If browser is not supported
        """
        browser = browser.lower()
        if browser == 'chrome':
            return random.choice(self.CHROME_USER_AGENTS)
        elif browser == 'firefox':
            return random.choice(self.FIREFOX_USER_AGENTS)
        elif browser == 'safari':
            return random.choice(self.SAFARI_USER_AGENTS)
        elif browser == 'edge':
            return random.choice(self.EDGE_USER_AGENTS)
        else:
            raise ValueError(f"Unsupported browser: {browser}")

    def random_platform(self, platform: str) -> str:
        """
        Get a random user agent for a specific platform.

        Args:
            platform: Platform name ('windows', 'macos', 'linux', 'mobile')

        Returns:
            Random user agent string for the specified platform

        Raises:
            ValueError: If platform is not supported
        """
        platform = platform.lower()
        if platform == 'windows':
            return random.choice(self.WINDOWS_USER_AGENTS)
        elif platform in ('macos', 'mac'):
            return random.choice(self.MACOS_USER_AGENTS)
        elif platform == 'linux':
            return random.choice(self.LINUX_USER_AGENTS)
        elif platform == 'mobile':
            return random.choice(self.MOBILE_USER_AGENTS)
        else:
            raise ValueError(f"Unsupported platform: {platform}")

    def add_user_agent(self, user_agent: str) -> None:
        """
        Add a custom user agent to the list.

        Args:
            user_agent: Custom user agent string to add
        """
        if user_agent not in self.USER_AGENTS:
            self.USER_AGENTS.append(user_agent)

    def get_all(self) -> List[str]:
        """
        Get all available user agent strings.

        Returns:
            List of all user agent strings
        """
        return self.USER_AGENTS.copy()

# Global instance for convenience
_default_generator = UserAgentGenerator()


def random_user_agent() -> str:
    """
    Get a random user agent string using the default generator.

    Returns:
        Random user agent string

    Examples:
        >>> ua = random_user_agent()
        >>> "Mozilla" in ua
        True
    """
    return _default_generator.random()


def random_user_agent_browser(browser: str) -> str:
    """
    Get a random user agent for a specific browser.

    Args:
        browser: Browser name ('chrome', 'firefox', 'safari', 'edge')

    Returns:
        Random user agent string for the specified browser
    """
    return _default_generator.random_browser(browser)


def random_user_agent_platform(platform: str) -> str:
    """
    Get a random user agent for a specific platform.

    Args:
        platform: Platform name ('windows', 'macos', 'linux', 'mobile')

    Returns:
        Random user agent string for the specified platform
    """
    return _default_generator.random_platform(platform)


def set_user_agent_seed(seed: int) -> None:
    """
    Set the random seed for user agent generation.

    Args:
        seed: Random seed value
    """
    global _default_generator
    _default_generator = UserAgentGenerator(seed)


def add_custom_user_agent(user_agent: str) -> None:
    """
    Add a custom user agent to the default generator.

    Args:
        user_agent: Custom user agent string to add
    """
    _default_generator.add_user_agent(user_agent)


def get_all_user_agents() -> List[str]:
    """
    Get all available user agent strings from the default generator.

    Returns:
        List of all user agent strings
    """
    return _default_generator.get_all()
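The updated provider test above wires this into the HTTP client roughly as follows; a small sketch assuming Client there is httpx.Client and using a placeholder URL:

    from httpx import Client

    from fastanime.libs.provider.scraping.user_agents import random_user_agent

    client = Client(headers={"User-Agent": random_user_agent()})
    response = client.get("https://example.com")  # placeholder endpoint
    print(response.request.headers["User-Agent"])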
264	fastanime/libs/provider/scraping/utils.py	Normal file
@@ -0,0 +1,264 @@
"""
Encoding and utility functions for web scraping.

Provides various encoding utilities including base-N encoding
that was previously sourced from yt-dlp.
"""

import string
from typing import Union, Optional


def encode_base_n(num: int, n: int, table: Optional[str] = None) -> str:
    """
    Encode a number in base-n representation.

    Args:
        num: The number to encode
        n: The base to use for encoding
        table: Custom character table (optional)

    Returns:
        String representation of the number in base-n

    Examples:
        >>> encode_base_n(255, 16)
        'ff'
        >>> encode_base_n(42, 36)
        '16'
    """
    if table is None:
        # Default table: 0-9, a-z
        table = string.digits + string.ascii_lowercase

    if not 2 <= n <= len(table):
        raise ValueError(f"Base must be between 2 and {len(table)}")

    if num == 0:
        return table[0]

    result = []
    is_negative = num < 0
    num = abs(num)

    while num > 0:
        result.append(table[num % n])
        num //= n

    if is_negative:
        result.append('-')

    return ''.join(reversed(result))

def decode_base_n(encoded: str, n: int, table: Optional[str] = None) -> int:
    """
    Decode a base-n encoded string back to an integer.

    Args:
        encoded: The base-n encoded string
        n: The base used for encoding
        table: Custom character table (optional)

    Returns:
        The decoded integer

    Examples:
        >>> decode_base_n('ff', 16)
        255
        >>> decode_base_n('16', 36)
        42
    """
    if table is None:
        table = string.digits + string.ascii_lowercase

    if not 2 <= n <= len(table):
        raise ValueError(f"Base must be between 2 and {len(table)}")

    if not encoded:
        return 0

    is_negative = encoded.startswith('-')
    if is_negative:
        encoded = encoded[1:]

    result = 0
    for i, char in enumerate(reversed(encoded.lower())):
        if char not in table:
            raise ValueError(f"Invalid character '{char}' for base {n}")

        digit_value = table.index(char)
        if digit_value >= n:
            raise ValueError(f"Invalid digit '{char}' for base {n}")

        result += digit_value * (n ** i)

    return -result if is_negative else result

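A quick round-trip check of the two base-N helpers, with arbitrarily chosen values; encode_base_n is what the AnimePahe key creator above now imports from this module:

    from fastanime.libs.provider.scraping.utils import decode_base_n, encode_base_n

    for value in (0, 42, 255, 99999):
        assert decode_base_n(encode_base_n(value, 36), 36) == value
    print(encode_base_n(255, 16))  # ff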
def url_encode(text: str, safe: str = '') -> str:
    """
    URL encode a string.

    Args:
        text: Text to encode
        safe: Characters that should not be encoded

    Returns:
        URL encoded string
    """
    import urllib.parse
    return urllib.parse.quote(text, safe=safe)


def url_decode(text: str) -> str:
    """
    URL decode a string.

    Args:
        text: URL encoded text to decode

    Returns:
        Decoded string
    """
    import urllib.parse
    return urllib.parse.unquote(text)


def html_unescape(text: str) -> str:
    """
    Unescape HTML entities in text.

    Args:
        text: Text containing HTML entities

    Returns:
        Text with HTML entities unescaped

    Examples:
        >>> html_unescape('&quot;Hello&quot; &amp; &lt;World&gt;')
        '"Hello" & <World>'
    """
    import html
    return html.unescape(text)

def strip_tags(html_content: str) -> str:
    """
    Remove all HTML tags from content, leaving only text.

    Args:
        html_content: HTML content with tags

    Returns:
        Plain text with tags removed

    Examples:
        >>> strip_tags('<p>Hello <b>world</b>!</p>')
        'Hello world!'
    """
    import re
    return re.sub(r'<[^>]+>', '', html_content)


def normalize_whitespace(text: str) -> str:
    """
    Normalize whitespace in text by collapsing multiple spaces and removing leading/trailing whitespace.

    Args:
        text: Text to normalize

    Returns:
        Text with normalized whitespace

    Examples:
        >>> normalize_whitespace('  Hello   world  \\n\\t  ')
        'Hello world'
    """
    import re
    return re.sub(r'\s+', ' ', text.strip())

def extract_domain(url: str) -> str:
    """
    Extract domain from a URL.

    Args:
        url: Full URL

    Returns:
        Domain portion of the URL

    Examples:
        >>> extract_domain('https://example.com/path?query=1')
        'example.com'
    """
    import urllib.parse
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc


def join_url(base: str, path: str) -> str:
    """
    Join a base URL with a path.

    Args:
        base: Base URL
        path: Path to join

    Returns:
        Combined URL

    Examples:
        >>> join_url('https://example.com', '/api/data')
        'https://example.com/api/data'
    """
    import urllib.parse
    return urllib.parse.urljoin(base, path)

def parse_query_string(query: str) -> dict:
    """
    Parse a query string into a dictionary.

    Args:
        query: Query string (with or without leading '?')

    Returns:
        Dictionary of query parameters

    Examples:
        >>> parse_query_string('?name=John&age=30')
        {'name': ['John'], 'age': ['30']}
    """
    import urllib.parse
    if query.startswith('?'):
        query = query[1:]
    return urllib.parse.parse_qs(query)


def build_query_string(params: dict) -> str:
    """
    Build a query string from a dictionary of parameters.

    Args:
        params: Dictionary of parameters

    Returns:
        URL-encoded query string

    Examples:
        >>> build_query_string({'name': 'John', 'age': 30})
        'name=John&age=30'
    """
    import urllib.parse

    # Handle both single values and lists
    normalized_params = {}
    for key, value in params.items():
        if isinstance(value, (list, tuple)):
            normalized_params[key] = value
        else:
            normalized_params[key] = [str(value)]

    return urllib.parse.urlencode(normalized_params, doseq=True)
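The URL helpers compose in the obvious way; a small sketch using a placeholder host:

    from fastanime.libs.provider.scraping.utils import (
        build_query_string,
        extract_domain,
        join_url,
        parse_query_string,
    )

    query = build_query_string({"m": "release", "id": 123})   # 'm=release&id=123'
    url = join_url("https://host.example", "/api?" + query)
    print(extract_domain(url))                                 # host.example
    print(parse_query_string(url.split("?", 1)[1]))            # {'m': ['release'], 'id': ['123']}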