feat: refactor provider imports and enhance HTML parsing utilities

This commit is contained in:
Benexl
2025-07-24 23:36:22 +03:00
parent 6017833605
commit 4bbfe221f2
8 changed files with 992 additions and 12 deletions

2
fa
View File

@@ -3,4 +3,4 @@ provider_type=$1
provider_name=$2
[ -z "$provider_type" ] && echo "Please specify provider type" && exit
[ -z "$provider_name" ] && echo "Please specify provider type" && exit
uv run python -m fastanime.libs.providers.${provider_type}.${provider_name}.provider
uv run python -m fastanime.libs.provider.${provider_type}.${provider_name}.provider

View File

@@ -2,7 +2,7 @@ import re
def animepahe_key_creator(c: int, a: int):
from yt_dlp.utils import encode_base_n
from ...scraping.utils import encode_base_n
if c < a:
val_a = ""
@@ -37,17 +37,18 @@ ENCODE_JS_REGEX = re.compile(r"'(.*?);',(\d+),(\d+),'(.*)'\.split")
def process_animepahe_embed_page(embed_page: str):
from yt_dlp.utils import get_element_text_and_html_by_tag
from ...scraping.html_parser import get_element_text_and_html_by_tag
encoded_js_string = ""
embed_page_content = embed_page
for _ in range(8):
text, html = get_element_text_and_html_by_tag("script", embed_page_content)
if not text:
if not text and html:
embed_page_content = re.sub(html, "", embed_page_content)
continue
encoded_js_string = text.strip()
break
if text:
encoded_js_string = text.strip()
break
if not encoded_js_string:
return
obsfucated_js_parameter_match = PARAMETERS_REGEX.search(encoded_js_string)

View File

@@ -106,8 +106,7 @@ class AnimePahe(BaseAnimeProvider):
@debug_provider
def episode_streams(self, params: EpisodeStreamsParams) -> Iterator[Server] | None:
# TODO: replace with custom implementations using default html parser or lxml
from yt_dlp.utils import (
from ...scraping.html_parser import (
extract_attributes,
get_element_by_id,
get_elements_html_by_class,
@@ -125,6 +124,9 @@ class AnimePahe(BaseAnimeProvider):
response.raise_for_status()
c = get_element_by_id("resolutionMenu", response.text)
if not c:
logger.error("Resolution menu not found in the response")
return
resolutionMenuItems = get_elements_html_by_class("dropdown-item", c)
res_dicts = [extract_attributes(item) for item in resolutionMenuItems]
quality = None
@@ -133,8 +135,9 @@ class AnimePahe(BaseAnimeProvider):
# TODO: better document the scraping process
for res_dict in res_dicts:
embed_url = res_dict["data-src"]
data_audio = "dub" if res_dict["data-audio"] == "eng" else "sub"
# the actual attributes are data attributes in the original html 'prefixed with data-'
embed_url = res_dict["src"]
data_audio = "dub" if res_dict["audio"] == "eng" else "sub"
if data_audio != params.translation_type:
continue
@@ -162,7 +165,7 @@ class AnimePahe(BaseAnimeProvider):
logger.error("failed to find juicy stream")
continue
juicy_stream = juicy_stream.group(1)
quality = res_dict["data-resolution"]
quality = res_dict["resolution"]
translation_type = data_audio
stream_link = juicy_stream

View File

@@ -36,7 +36,7 @@ def test_anime_provider(AnimeProvider: Type[BaseAnimeProvider]):
anime_provider = AnimeProvider(
Client(headers={"User-Agent": random_user_agent(), **AnimeProvider.HEADERS})
)
print(APP_ASCII_ART)
print(APP_ASCII_ART.read_text(encoding="utf-8"))
query = input("What anime would you like to stream: ")
search_results = anime_provider.search(SearchParams(query=query))
if not search_results:

View File

@@ -0,0 +1,474 @@
"""
HTML parsing utilities with optional lxml support.
This module provides comprehensive HTML parsing capabilities using either
Python's built-in html.parser or lxml for better performance when available.
"""
# TODO: Review and optimize the HTML parsing logic for better performance and flexibility.
# Consider adding more utility functions for common HTML manipulation tasks.
import logging
import re
from html.parser import HTMLParser as BaseHTMLParser
from typing import Dict, List, Optional, Tuple, Union
logger = logging.getLogger(__name__)
# Try to import lxml
HAS_LXML = False
try:
from lxml import etree, html as lxml_html
HAS_LXML = True
logger.debug("lxml is available and will be used for HTML parsing")
except ImportError:
logger.debug("lxml not available, falling back to html.parser")
class HTMLParserConfig:
"""Configuration for HTML parser selection."""
def __init__(self, use_lxml: Optional[bool] = None):
"""
Initialize parser configuration.
Args:
use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None)
"""
if use_lxml is None:
self.use_lxml = HAS_LXML
else:
self.use_lxml = use_lxml and HAS_LXML
if use_lxml and not HAS_LXML:
logger.warning("lxml requested but not available, falling back to html.parser")
class HTMLParser:
"""
Comprehensive HTML parser with optional lxml support.
Provides a unified interface for HTML parsing operations regardless
of the underlying parser implementation.
"""
def __init__(self, config: Optional[HTMLParserConfig] = None):
"""Initialize the HTML parser with configuration."""
self.config = config or HTMLParserConfig()
def parse(self, html_content: str) -> Union[etree._Element, 'ParsedHTML']:
"""
Parse HTML content and return a parsed tree.
Args:
html_content: Raw HTML string to parse
Returns:
Parsed HTML tree (lxml Element or custom ParsedHTML object)
"""
if self.config.use_lxml:
return self._parse_with_lxml(html_content)
else:
return self._parse_with_builtin(html_content)
def _parse_with_lxml(self, html_content: str) -> etree._Element:
"""Parse HTML using lxml."""
try:
# Use lxml's HTML parser which is more lenient
return lxml_html.fromstring(html_content)
except Exception as e:
logger.warning(f"lxml parsing failed: {e}, falling back to html.parser")
return self._parse_with_builtin(html_content)
def _parse_with_builtin(self, html_content: str) -> 'ParsedHTML':
"""Parse HTML using Python's built-in parser."""
parser = BuiltinHTMLParser()
parser.feed(html_content)
return ParsedHTML(parser.elements, html_content)
class BuiltinHTMLParser(BaseHTMLParser):
"""Enhanced HTML parser using Python's built-in capabilities."""
def __init__(self):
super().__init__()
self.elements = []
self.current_element = None
self.element_stack = []
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
"""Handle opening tags."""
element = {
'tag': tag,
'attrs': dict(attrs),
'text': '',
'children': [],
'start_pos': self.getpos(),
}
if self.element_stack:
self.element_stack[-1]['children'].append(element)
else:
self.elements.append(element)
self.element_stack.append(element)
def handle_endtag(self, tag: str):
"""Handle closing tags."""
if self.element_stack and self.element_stack[-1]['tag'] == tag:
element = self.element_stack.pop()
element['end_pos'] = self.getpos()
def handle_data(self, data: str):
"""Handle text content."""
if self.element_stack:
self.element_stack[-1]['text'] += data
class ParsedHTML:
"""Wrapper for parsed HTML using built-in parser."""
def __init__(self, elements: List[Dict], raw_html: str):
self.elements = elements
self.raw_html = raw_html
def find_by_id(self, element_id: str) -> Optional[Dict]:
"""Find element by ID."""
return self._find_recursive(self.elements, lambda el: el['attrs'].get('id') == element_id)
def find_by_class(self, class_name: str) -> List[Dict]:
"""Find elements by class name."""
results = []
self._find_all_recursive(
self.elements,
lambda el: class_name in el['attrs'].get('class', '').split(),
results
)
return results
def find_by_tag(self, tag_name: str) -> List[Dict]:
"""Find elements by tag name."""
results = []
self._find_all_recursive(
self.elements,
lambda el: el['tag'].lower() == tag_name.lower(),
results
)
return results
def _find_recursive(self, elements: List[Dict], condition) -> Optional[Dict]:
"""Recursively find first element matching condition."""
for element in elements:
if condition(element):
return element
result = self._find_recursive(element['children'], condition)
if result:
return result
return None
def _find_all_recursive(self, elements: List[Dict], condition, results: List[Dict]):
"""Recursively find all elements matching condition."""
for element in elements:
if condition(element):
results.append(element)
self._find_all_recursive(element['children'], condition, results)
# Global parser instance
_default_parser = HTMLParser()
def extract_attributes(html_element: str) -> Dict[str, str]:
"""
Extract attributes from an HTML element string.
Args:
html_element: HTML element as string (e.g., '<div class="test" id="main">')
Returns:
Dictionary of attribute name-value pairs
Examples:
>>> extract_attributes('<div class="test" id="main">')
{'class': 'test', 'id': 'main'}
"""
if not html_element:
return {}
# Use regex to extract attributes from HTML string
attr_pattern = r'(\w+)=(["\'])([^"\']*?)\2'
matches = re.findall(attr_pattern, html_element)
attributes = {}
for match in matches:
attr_name, _, attr_value = match
attributes[attr_name] = attr_value
# Handle attributes without quotes
unquoted_pattern = r'(\w+)=([^\s>]+)'
unquoted_matches = re.findall(unquoted_pattern, html_element)
for attr_name, attr_value in unquoted_matches:
if attr_name not in attributes:
attributes[attr_name] = attr_value
return attributes
def get_element_by_id(element_id: str, html_content: str) -> Optional[str]:
"""
Get HTML element by ID.
Args:
element_id: The ID attribute value to search for
html_content: HTML content to search in
Returns:
HTML string of the element or None if not found
Examples:
>>> html = '<div id="test">Content</div>'
>>> get_element_by_id("test", html)
'<div id="test">Content</div>'
"""
parsed = _default_parser.parse(html_content)
if _default_parser.config.use_lxml:
try:
element = parsed.xpath(f'//*[@id="{element_id}"]')
if element:
return etree.tostring(element[0], encoding='unicode', method='html')
except Exception as e:
logger.warning(f"lxml XPath search failed: {e}")
return None
else:
element = parsed.find_by_id(element_id)
if element:
return _element_to_html(element, html_content)
return None
def get_element_by_tag(tag_name: str, html_content: str) -> Optional[str]:
"""
Get first HTML element by tag name.
Args:
tag_name: The tag name to search for
html_content: HTML content to search in
Returns:
HTML string of the element or None if not found
"""
parsed = _default_parser.parse(html_content)
if _default_parser.config.use_lxml:
try:
elements = parsed.xpath(f'//{tag_name}')
if elements:
return etree.tostring(elements[0], encoding='unicode', method='html')
except Exception as e:
logger.warning(f"lxml XPath search failed: {e}")
return None
else:
elements = parsed.find_by_tag(tag_name)
if elements:
return _element_to_html(elements[0], html_content)
return None
def get_element_by_class(class_name: str, html_content: str) -> Optional[str]:
"""
Get first HTML element by class name.
Args:
class_name: The class name to search for
html_content: HTML content to search in
Returns:
HTML string of the element or None if not found
"""
parsed = _default_parser.parse(html_content)
if _default_parser.config.use_lxml:
try:
elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
if elements:
return etree.tostring(elements[0], encoding='unicode', method='html')
except Exception as e:
logger.warning(f"lxml XPath search failed: {e}")
return None
else:
elements = parsed.find_by_class(class_name)
if elements:
return _element_to_html(elements[0], html_content)
return None
def get_elements_by_tag(tag_name: str, html_content: str) -> List[str]:
"""
Get all HTML elements by tag name.
Args:
tag_name: The tag name to search for
html_content: HTML content to search in
Returns:
List of HTML strings for matching elements
"""
parsed = _default_parser.parse(html_content)
results = []
if _default_parser.config.use_lxml:
try:
elements = parsed.xpath(f'//{tag_name}')
for element in elements:
results.append(etree.tostring(element, encoding='unicode', method='html'))
except Exception as e:
logger.warning(f"lxml XPath search failed: {e}")
else:
elements = parsed.find_by_tag(tag_name)
for element in elements:
results.append(_element_to_html(element, html_content))
return results
def get_elements_by_class(class_name: str, html_content: str) -> List[str]:
"""
Get all HTML elements by class name.
Args:
class_name: The class name to search for
html_content: HTML content to search in
Returns:
List of HTML strings for matching elements
"""
parsed = _default_parser.parse(html_content)
results = []
if _default_parser.config.use_lxml:
try:
elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
for element in elements:
results.append(etree.tostring(element, encoding='unicode', method='html'))
except Exception as e:
logger.warning(f"lxml XPath search failed: {e}")
else:
elements = parsed.find_by_class(class_name)
for element in elements:
results.append(_element_to_html(element, html_content))
return results
def get_elements_html_by_class(class_name: str, html_content: str) -> List[str]:
"""
Get HTML strings of elements by class name.
This is an alias for get_elements_by_class for yt-dlp compatibility.
Args:
class_name: The class name to search for
html_content: HTML content to search in
Returns:
List of HTML strings for matching elements
"""
return get_elements_by_class(class_name, html_content)
def get_element_text_and_html_by_tag(tag_name: str, html_content: str) -> Tuple[Optional[str], Optional[str]]:
"""
Get both text content and HTML of first element by tag name.
Args:
tag_name: The tag name to search for
html_content: HTML content to search in
Returns:
Tuple of (text_content, html_string) or (None, None) if not found
Examples:
>>> html = '<script>alert("test");</script>'
>>> get_element_text_and_html_by_tag("script", html)
('alert("test");', '<script>alert("test");</script>')
"""
parsed = _default_parser.parse(html_content)
if _default_parser.config.use_lxml:
try:
elements = parsed.xpath(f'//{tag_name}')
if elements:
element = elements[0]
text = element.text_content() if hasattr(element, 'text_content') else (element.text or '')
html_str = etree.tostring(element, encoding='unicode', method='html')
return text, html_str
except Exception as e:
logger.warning(f"lxml XPath search failed: {e}")
return None, None
else:
elements = parsed.find_by_tag(tag_name)
if elements:
element = elements[0]
text = _extract_text_content(element)
html_str = _element_to_html(element, html_content)
return text, html_str
return None, None
def _element_to_html(element: Dict, original_html: str) -> str:
"""
Convert parsed element back to HTML string.
This is a simplified implementation that reconstructs HTML from parsed data.
For production use, consider using lxml for better accuracy.
"""
if not element:
return ""
# Build opening tag
tag = element['tag']
attrs = element.get('attrs', {})
attr_str = ' '.join(f'{k}="{v}"' for k, v in attrs.items() if v is not None)
if attr_str:
opening_tag = f"<{tag} {attr_str}>"
else:
opening_tag = f"<{tag}>"
# Add text content
text = element.get('text', '')
# Add children
children_html = ""
for child in element.get('children', []):
children_html += _element_to_html(child, original_html)
# Build closing tag
closing_tag = f"</{tag}>"
return f"{opening_tag}{text}{children_html}{closing_tag}"
def _extract_text_content(element: Dict) -> str:
"""Extract all text content from element and its children."""
text = element.get('text', '')
for child in element.get('children', []):
text += _extract_text_content(child)
return text
def configure_parser(use_lxml: Optional[bool] = None) -> None:
"""
Configure the global HTML parser.
Args:
use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None)
"""
global _default_parser
_default_parser = HTMLParser(HTMLParserConfig(use_lxml))
logger.info(f"HTML parser configured: {'lxml' if _default_parser.config.use_lxml else 'html.parser'}")

View File

@@ -0,0 +1,238 @@
"""
User agent utilities for web scraping.
Provides functionality to generate random user agent strings
to avoid detection and blocking by websites.
"""
import random
from typing import List, Optional
class UserAgentGenerator:
"""
Generator for realistic user agent strings.
Provides a variety of common user agents from different browsers
and operating systems to help avoid detection.
"""
# Common user agents for different browsers and OS combinations
USER_AGENTS = [
# Chrome on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# Chrome on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
# Chrome on Linux
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
# Firefox on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
# Firefox on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0",
# Firefox on Linux
"Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
# Safari on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
# Edge on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
# Mobile Chrome (Android)
"Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
"Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
# Mobile Safari (iOS)
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPad; CPU OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
]
# Browser-specific user agents for when you need a specific browser
CHROME_USER_AGENTS = [ua for ua in USER_AGENTS if "Chrome" in ua and "Edg" not in ua]
FIREFOX_USER_AGENTS = [ua for ua in USER_AGENTS if "Firefox" in ua]
SAFARI_USER_AGENTS = [ua for ua in USER_AGENTS if "Safari" in ua and "Chrome" not in ua]
EDGE_USER_AGENTS = [ua for ua in USER_AGENTS if "Edg" in ua]
# Platform-specific user agents
WINDOWS_USER_AGENTS = [ua for ua in USER_AGENTS if "Windows NT" in ua]
MACOS_USER_AGENTS = [ua for ua in USER_AGENTS if "Macintosh" in ua]
LINUX_USER_AGENTS = [ua for ua in USER_AGENTS if "Linux" in ua and "Android" not in ua]
MOBILE_USER_AGENTS = [ua for ua in USER_AGENTS if "Mobile" in ua or "Android" in ua]
def __init__(self, seed: Optional[int] = None):
"""
Initialize the user agent generator.
Args:
seed: Random seed for reproducible results (optional)
"""
if seed is not None:
random.seed(seed)
def random(self) -> str:
"""
Get a random user agent string.
Returns:
Random user agent string
"""
return random.choice(self.USER_AGENTS)
def random_browser(self, browser: str) -> str:
"""
Get a random user agent for a specific browser.
Args:
browser: Browser name ('chrome', 'firefox', 'safari', 'edge')
Returns:
Random user agent string for the specified browser
Raises:
ValueError: If browser is not supported
"""
browser = browser.lower()
if browser == 'chrome':
return random.choice(self.CHROME_USER_AGENTS)
elif browser == 'firefox':
return random.choice(self.FIREFOX_USER_AGENTS)
elif browser == 'safari':
return random.choice(self.SAFARI_USER_AGENTS)
elif browser == 'edge':
return random.choice(self.EDGE_USER_AGENTS)
else:
raise ValueError(f"Unsupported browser: {browser}")
def random_platform(self, platform: str) -> str:
"""
Get a random user agent for a specific platform.
Args:
platform: Platform name ('windows', 'macos', 'linux', 'mobile')
Returns:
Random user agent string for the specified platform
Raises:
ValueError: If platform is not supported
"""
platform = platform.lower()
if platform == 'windows':
return random.choice(self.WINDOWS_USER_AGENTS)
elif platform in ('macos', 'mac'):
return random.choice(self.MACOS_USER_AGENTS)
elif platform == 'linux':
return random.choice(self.LINUX_USER_AGENTS)
elif platform == 'mobile':
return random.choice(self.MOBILE_USER_AGENTS)
else:
raise ValueError(f"Unsupported platform: {platform}")
def add_user_agent(self, user_agent: str) -> None:
"""
Add a custom user agent to the list.
Args:
user_agent: Custom user agent string to add
"""
if user_agent not in self.USER_AGENTS:
self.USER_AGENTS.append(user_agent)
def get_all(self) -> List[str]:
"""
Get all available user agent strings.
Returns:
List of all user agent strings
"""
return self.USER_AGENTS.copy()
# Global instance for convenience
_default_generator = UserAgentGenerator()
def random_user_agent() -> str:
"""
Get a random user agent string using the default generator.
Returns:
Random user agent string
Examples:
>>> ua = random_user_agent()
>>> "Mozilla" in ua
True
"""
return _default_generator.random()
def random_user_agent_browser(browser: str) -> str:
"""
Get a random user agent for a specific browser.
Args:
browser: Browser name ('chrome', 'firefox', 'safari', 'edge')
Returns:
Random user agent string for the specified browser
"""
return _default_generator.random_browser(browser)
def random_user_agent_platform(platform: str) -> str:
"""
Get a random user agent for a specific platform.
Args:
platform: Platform name ('windows', 'macos', 'linux', 'mobile')
Returns:
Random user agent string for the specified platform
"""
return _default_generator.random_platform(platform)
def set_user_agent_seed(seed: int) -> None:
"""
Set the random seed for user agent generation.
Args:
seed: Random seed value
"""
global _default_generator
_default_generator = UserAgentGenerator(seed)
def add_custom_user_agent(user_agent: str) -> None:
"""
Add a custom user agent to the default generator.
Args:
user_agent: Custom user agent string to add
"""
_default_generator.add_user_agent(user_agent)
def get_all_user_agents() -> List[str]:
"""
Get all available user agent strings from the default generator.
Returns:
List of all user agent strings
"""
return _default_generator.get_all()

View File

@@ -0,0 +1,264 @@
"""
Encoding and utility functions for web scraping.
Provides various encoding utilities including base-N encoding
that was previously sourced from yt-dlp.
"""
import string
from typing import Union,Optional
def encode_base_n(num: int, n: int, table: Optional[str] = None) -> str:
"""
Encode a number in base-n representation.
Args:
num: The number to encode
n: The base to use for encoding
table: Custom character table (optional)
Returns:
String representation of the number in base-n
Examples:
>>> encode_base_n(255, 16)
'ff'
>>> encode_base_n(42, 36)
'16'
"""
if table is None:
# Default table: 0-9, a-z
table = string.digits + string.ascii_lowercase
if not 2 <= n <= len(table):
raise ValueError(f"Base must be between 2 and {len(table)}")
if num == 0:
return table[0]
result = []
is_negative = num < 0
num = abs(num)
while num > 0:
result.append(table[num % n])
num //= n
if is_negative:
result.append('-')
return ''.join(reversed(result))
def decode_base_n(encoded: str, n: int, table: Optional[str] = None) -> int:
"""
Decode a base-n encoded string back to an integer.
Args:
encoded: The base-n encoded string
n: The base used for encoding
table: Custom character table (optional)
Returns:
The decoded integer
Examples:
>>> decode_base_n('ff', 16)
255
>>> decode_base_n('16', 36)
42
"""
if table is None:
table = string.digits + string.ascii_lowercase
if not 2 <= n <= len(table):
raise ValueError(f"Base must be between 2 and {len(table)}")
if not encoded:
return 0
is_negative = encoded.startswith('-')
if is_negative:
encoded = encoded[1:]
result = 0
for i, char in enumerate(reversed(encoded.lower())):
if char not in table:
raise ValueError(f"Invalid character '{char}' for base {n}")
digit_value = table.index(char)
if digit_value >= n:
raise ValueError(f"Invalid digit '{char}' for base {n}")
result += digit_value * (n ** i)
return -result if is_negative else result
def url_encode(text: str, safe: str = '') -> str:
"""
URL encode a string.
Args:
text: Text to encode
safe: Characters that should not be encoded
Returns:
URL encoded string
"""
import urllib.parse
return urllib.parse.quote(text, safe=safe)
def url_decode(text: str) -> str:
"""
URL decode a string.
Args:
text: URL encoded text to decode
Returns:
Decoded string
"""
import urllib.parse
return urllib.parse.unquote(text)
def html_unescape(text: str) -> str:
"""
Unescape HTML entities in text.
Args:
text: Text containing HTML entities
Returns:
Text with HTML entities unescaped
Examples:
>>> html_unescape('&quot;Hello&quot; &amp; &lt;World&gt;')
'"Hello" & <World>'
"""
import html
return html.unescape(text)
def strip_tags(html_content: str) -> str:
"""
Remove all HTML tags from content, leaving only text.
Args:
html_content: HTML content with tags
Returns:
Plain text with tags removed
Examples:
>>> strip_tags('<p>Hello <b>world</b>!</p>')
'Hello world!'
"""
import re
return re.sub(r'<[^>]+>', '', html_content)
def normalize_whitespace(text: str) -> str:
"""
Normalize whitespace in text by collapsing multiple spaces and removing leading/trailing whitespace.
Args:
text: Text to normalize
Returns:
Text with normalized whitespace
Examples:
>>> normalize_whitespace(' Hello world \\n\\t ')
'Hello world'
"""
import re
return re.sub(r'\s+', ' ', text.strip())
def extract_domain(url: str) -> str:
"""
Extract domain from a URL.
Args:
url: Full URL
Returns:
Domain portion of the URL
Examples:
>>> extract_domain('https://example.com/path?query=1')
'example.com'
"""
import urllib.parse
parsed = urllib.parse.urlparse(url)
return parsed.netloc
def join_url(base: str, path: str) -> str:
"""
Join a base URL with a path.
Args:
base: Base URL
path: Path to join
Returns:
Combined URL
Examples:
>>> join_url('https://example.com', '/api/data')
'https://example.com/api/data'
"""
import urllib.parse
return urllib.parse.urljoin(base, path)
def parse_query_string(query: str) -> dict:
"""
Parse a query string into a dictionary.
Args:
query: Query string (with or without leading '?')
Returns:
Dictionary of query parameters
Examples:
>>> parse_query_string('?name=John&age=30')
{'name': ['John'], 'age': ['30']}
"""
import urllib.parse
if query.startswith('?'):
query = query[1:]
return urllib.parse.parse_qs(query)
def build_query_string(params: dict) -> str:
"""
Build a query string from a dictionary of parameters.
Args:
params: Dictionary of parameters
Returns:
URL-encoded query string
Examples:
>>> build_query_string({'name': 'John', 'age': 30})
'name=John&age=30'
"""
import urllib.parse
# Handle both single values and lists
normalized_params = {}
for key, value in params.items():
if isinstance(value, (list, tuple)):
normalized_params[key] = value
else:
normalized_params[key] = [str(value)]
return urllib.parse.urlencode(normalized_params, doseq=True)