refactor(proxy): consolidate proxy configuration handling

Moves ProxyConfig from configs/ directory into proxy_strategy.py to improve code organization and reduce fragmentation. Updates all imports and type hints to reflect the new location.

Key changes:
- Moved ProxyConfig class from configs/proxy_config.py to proxy_strategy.py
- Updated type hints in async_configs.py to support ProxyConfig
- Fixed proxy configuration handling in browser_manager.py
- Updated documentation and examples to use new import path

BREAKING CHANGE: ProxyConfig import path has changed from crawl4ai.configs to crawl4ai.proxy_strategy
This commit is contained in:
UncleCode
2025-03-07 23:14:11 +08:00
parent a68cbb232b
commit 4aeb7ef9ad
11 changed files with 311 additions and 129 deletions

View File

@@ -26,6 +26,8 @@ import inspect
from typing import Any, Dict, Optional
from enum import Enum
from .proxy_strategy import ProxyConfig
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
"""
@@ -180,7 +182,7 @@ class BrowserConfig:
is "chromium". Default: "chromium".
proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
Default: None.
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
viewport_width (int): Default viewport width for pages. Default: 1080.
viewport_height (int): Default viewport height for pages. Default: 600.
@@ -225,7 +227,7 @@ class BrowserConfig:
chrome_channel: str = "chromium",
channel: str = "chromium",
proxy: str = None,
proxy_config: dict = None,
proxy_config: Union[ProxyConfig, dict, None] = None,
viewport_width: int = 1080,
viewport_height: int = 600,
viewport: dict = None,
@@ -315,7 +317,7 @@ class BrowserConfig:
chrome_channel=kwargs.get("chrome_channel", "chromium"),
channel=kwargs.get("channel", "chromium"),
proxy=kwargs.get("proxy"),
proxy_config=kwargs.get("proxy_config"),
proxy_config=kwargs.get("proxy_config", None),
viewport_width=kwargs.get("viewport_width", 1080),
viewport_height=kwargs.get("viewport_height", 600),
accept_downloads=kwargs.get("accept_downloads", False),
@@ -515,7 +517,7 @@ class CrawlerRunConfig():
Default: "lxml".
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
Default: WebScrapingStrategy.
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
# SSL Parameters
@@ -656,7 +658,7 @@ class CrawlerRunConfig():
prettiify: bool = False,
parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None,
proxy_config: dict = None,
proxy_config: Union[ProxyConfig, dict, None] = None,
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,

View File

@@ -767,6 +767,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Handle wait_for condition
# Todo: Decide how to handle this
if not config.wait_for and config.css_selector and False:
# if not config.wait_for and config.css_selector:
config.wait_for = f"css:{config.css_selector}"
if config.wait_for:
@@ -806,8 +807,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.remove_overlay_elements:
await self.remove_overlay_elements(page)
# Get final HTML content
html = await page.content()
if config.css_selector:
try:
# Handle comma-separated selectors by splitting them
selectors = [s.strip() for s in config.css_selector.split(',')]
html_parts = []
for selector in selectors:
try:
content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''")
html_parts.append(content)
except Error as e:
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
# Wrap in a div to create a valid HTML structure
html = f"<div class='crawl4ai-result'>\n" + "\n".join(html_parts) + "\n</div>"
except Error as e:
raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
else:
html = await page.content()
# # Get final HTML content
# html = await page.content()
await self.execute_hook(
"before_return_html", page=page, html=html, context=context, config=config
)

View File

@@ -531,9 +531,9 @@ class BrowserManager:
ProxySettings(server=self.config.proxy)
if self.config.proxy
else ProxySettings(
server=self.config.proxy_config.get("server"),
username=self.config.proxy_config.get("username"),
password=self.config.proxy_config.get("password"),
server=self.config.proxy_config.server,
username=self.config.proxy_config.username,
password=self.config.proxy_config.password,
)
)
browser_args["proxy"] = proxy_settings

View File

@@ -1,2 +0,0 @@
from .proxy_config import ProxyConfig
__all__ = ["ProxyConfig"]

View File

@@ -1,113 +0,0 @@
import os
from typing import Dict, List, Optional
class ProxyConfig:
def __init__(
self,
server: str,
username: Optional[str] = None,
password: Optional[str] = None,
ip: Optional[str] = None,
):
"""Configuration class for a single proxy.
Args:
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
username: Optional username for proxy authentication
password: Optional password for proxy authentication
ip: Optional IP address for verification purposes
"""
self.server = server
self.username = username
self.password = password
# Extract IP from server if not explicitly provided
self.ip = ip or self._extract_ip_from_server()
def _extract_ip_from_server(self) -> Optional[str]:
"""Extract IP address from server URL."""
try:
# Simple extraction assuming http://ip:port format
if "://" in self.server:
parts = self.server.split("://")[1].split(":")
return parts[0]
else:
parts = self.server.split(":")
return parts[0]
except Exception:
return None
@staticmethod
def from_string(proxy_str: str) -> "ProxyConfig":
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
parts = proxy_str.split(":")
if len(parts) == 4: # ip:port:username:password
ip, port, username, password = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
username=username,
password=password,
ip=ip
)
elif len(parts) == 2: # ip:port only
ip, port = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
ip=ip
)
else:
raise ValueError(f"Invalid proxy string format: {proxy_str}")
@staticmethod
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
"""Create a ProxyConfig from a dictionary."""
return ProxyConfig(
server=proxy_dict.get("server"),
username=proxy_dict.get("username"),
password=proxy_dict.get("password"),
ip=proxy_dict.get("ip")
)
@staticmethod
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
"""Load proxies from environment variable.
Args:
env_var: Name of environment variable containing comma-separated proxy strings
Returns:
List of ProxyConfig objects
"""
proxies = []
try:
proxy_list = os.getenv(env_var, "").split(",")
for proxy in proxy_list:
if not proxy:
continue
proxies.append(ProxyConfig.from_string(proxy))
except Exception as e:
print(f"Error loading proxies from environment: {e}")
return proxies
def to_dict(self) -> Dict:
"""Convert to dictionary representation."""
return {
"server": self.server,
"username": self.username,
"password": self.password,
"ip": self.ip
}
def clone(self, **kwargs) -> "ProxyConfig":
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
ProxyConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return ProxyConfig.from_dict(config_dict)

View File

@@ -742,7 +742,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
for element in body.select(excluded_selector):
element.extract()
if css_selector:
if False and css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
return {
@@ -848,6 +848,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return {
# **markdown_content,
"scraped_html": html,
"cleaned_html": cleaned_html,
"success": success,
"media": media,

View File

@@ -1,8 +1,119 @@
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from itertools import cycle
import os
class ProxyConfig:
def __init__(
self,
server: str,
username: Optional[str] = None,
password: Optional[str] = None,
ip: Optional[str] = None,
):
"""Configuration class for a single proxy.
Args:
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
username: Optional username for proxy authentication
password: Optional password for proxy authentication
ip: Optional IP address for verification purposes
"""
self.server = server
self.username = username
self.password = password
# Extract IP from server if not explicitly provided
self.ip = ip or self._extract_ip_from_server()
def _extract_ip_from_server(self) -> Optional[str]:
"""Extract IP address from server URL."""
try:
# Simple extraction assuming http://ip:port format
if "://" in self.server:
parts = self.server.split("://")[1].split(":")
return parts[0]
else:
parts = self.server.split(":")
return parts[0]
except Exception:
return None
@staticmethod
def from_string(proxy_str: str) -> "ProxyConfig":
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
parts = proxy_str.split(":")
if len(parts) == 4: # ip:port:username:password
ip, port, username, password = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
username=username,
password=password,
ip=ip
)
elif len(parts) == 2: # ip:port only
ip, port = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
ip=ip
)
else:
raise ValueError(f"Invalid proxy string format: {proxy_str}")
@staticmethod
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
"""Create a ProxyConfig from a dictionary."""
return ProxyConfig(
server=proxy_dict.get("server"),
username=proxy_dict.get("username"),
password=proxy_dict.get("password"),
ip=proxy_dict.get("ip")
)
@staticmethod
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
"""Load proxies from environment variable.
Args:
env_var: Name of environment variable containing comma-separated proxy strings
Returns:
List of ProxyConfig objects
"""
proxies = []
try:
proxy_list = os.getenv(env_var, "").split(",")
for proxy in proxy_list:
if not proxy:
continue
proxies.append(ProxyConfig.from_string(proxy))
except Exception as e:
print(f"Error loading proxies from environment: {e}")
return proxies
def to_dict(self) -> Dict:
"""Convert to dictionary representation."""
return {
"server": self.server,
"username": self.username,
"password": self.password,
"ip": self.ip
}
def clone(self, **kwargs) -> "ProxyConfig":
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
ProxyConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return ProxyConfig.from_dict(config_dict)
from crawl4ai.configs import ProxyConfig
class ProxyRotationStrategy(ABC):
"""Base abstract class for proxy rotation strategies"""