feat(proxy): add proxy configuration support to CrawlerRunConfig

Add proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This enables users to specify different proxy settings for each crawl operation without modifying the browser config.

- Added proxy_config parameter to CrawlerRunConfig
- Updated BrowserManager to apply proxy settings from CrawlerRunConfig
- Updated proxy-security documentation with new usage examples
This commit is contained in:
UncleCode
2025-01-20 22:14:05 +08:00
parent 2cec527a22
commit 9247877037
4 changed files with 43 additions and 13 deletions

View File

@@ -1,3 +1,7 @@
### [Added] 2025-01-20
- Added proxy configuration support to CrawlerRunConfig, allowing dynamic proxy settings per crawl request
- Updated documentation with examples for using proxy configuration in crawl operations
### [Added] 2025-01-20 ### [Added] 2025-01-20
- New LLM-powered schema generation utility for JsonElementExtractionStrategy - New LLM-powered schema generation utility for JsonElementExtractionStrategy
- Support for automatic CSS and XPath schema generation using OpenAI or Ollama - Support for automatic CSS and XPath schema generation using OpenAI or Ollama

View File

@@ -270,6 +270,8 @@ class CrawlerRunConfig:
Default: "lxml". Default: "lxml".
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use. scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
Default: WebScrapingStrategy. Default: WebScrapingStrategy.
proxy_config (dict or None): Detailed proxy configuration for this crawl run, e.g. {"server": "...", "username": "...", "password": "..."}.
If None, no additional proxy config is applied. Default: None.
# Caching Parameters # Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled. cache_mode (CacheMode or None): Defines how caching is handled.
@@ -389,6 +391,7 @@ class CrawlerRunConfig:
prettiify: bool = False, prettiify: bool = False,
parser_type: str = "lxml", parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None, scraping_strategy: ContentScrapingStrategy = None,
proxy_config: dict = None,
# SSL Parameters # SSL Parameters
fetch_ssl_certificate: bool = False, fetch_ssl_certificate: bool = False,
# Caching Parameters # Caching Parameters
@@ -457,6 +460,7 @@ class CrawlerRunConfig:
self.prettiify = prettiify self.prettiify = prettiify
self.parser_type = parser_type self.parser_type = parser_type
self.scraping_strategy = scraping_strategy or WebScrapingStrategy() self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
self.proxy_config = proxy_config
# SSL Parameters # SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -553,6 +557,7 @@ class CrawlerRunConfig:
prettiify=kwargs.get("prettiify", False), prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"), parser_type=kwargs.get("parser_type", "lxml"),
scraping_strategy=kwargs.get("scraping_strategy"), scraping_strategy=kwargs.get("scraping_strategy"),
proxy_config=kwargs.get("proxy_config"),
# SSL Parameters # SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters # Caching Parameters
@@ -631,6 +636,7 @@ class CrawlerRunConfig:
"prettiify": self.prettiify, "prettiify": self.prettiify,
"parser_type": self.parser_type, "parser_type": self.parser_type,
"scraping_strategy": self.scraping_strategy, "scraping_strategy": self.scraping_strategy,
"proxy_config": self.proxy_config,
"fetch_ssl_certificate": self.fetch_ssl_certificate, "fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode, "cache_mode": self.cache_mode,
"session_id": self.session_id, "session_id": self.session_id,

View File

@@ -545,7 +545,7 @@ class BrowserManager:
): ):
await context.add_init_script(load_js_script("navigator_overrider")) await context.add_init_script(load_js_script("navigator_overrider"))
async def create_browser_context(self): async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
""" """
Creates and returns a new browser context with configured settings. Creates and returns a new browser context with configured settings.
Applies text-only mode settings if text_mode is enabled in config. Applies text-only mode settings if text_mode is enabled in config.
@@ -628,6 +628,16 @@ class BrowserManager:
"java_script_enabled": self.config.java_script_enabled, "java_script_enabled": self.config.java_script_enabled,
} }
if crawlerRunConfig:
# If crawlerRunConfig.proxy_config is set, add its proxy settings to the context settings
if crawlerRunConfig.proxy_config:
proxy_settings = {
"server": crawlerRunConfig.proxy_config.get("server"),
"username": crawlerRunConfig.proxy_config.get("username"),
"password": crawlerRunConfig.proxy_config.get("password"),
}
context_settings["proxy"] = proxy_settings
if self.config.text_mode: if self.config.text_mode:
text_mode_settings = { text_mode_settings = {
"has_touch": False, "has_touch": False,
@@ -710,7 +720,7 @@ class BrowserManager:
context = self.contexts_by_config[config_signature] context = self.contexts_by_config[config_signature]
else: else:
# Create and setup a new context # Create and setup a new context
context = await self.create_browser_context() context = await self.create_browser_context(crawlerRunConfig)
await self.setup_context(context, crawlerRunConfig) await self.setup_context(context, crawlerRunConfig)
self.contexts_by_config[config_signature] = context self.contexts_by_config[config_signature] = context

View File

@@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com") result = await crawler.arun(url="https://example.com")
``` ```
## Rotating Proxies Here's the corrected documentation:
Example using a proxy rotation service and updating `BrowserConfig` dynamically: ## Rotating Proxies [COMING SOON]
Example of rotating proxies dynamically by giving each crawl its own `proxy_config`:
```python ```python
from crawl4ai.async_configs import BrowserConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def get_next_proxy(): async def get_next_proxy():
# Your proxy rotation logic here # Your proxy rotation logic here
return {"server": "http://next.proxy.com:8080"} return {"server": "http://next.proxy.com:8080"}
async def main():
browser_config = BrowserConfig() browser_config = BrowserConfig()
run_config = CrawlerRunConfig()
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
# Update proxy for each request # For each URL, create a new run config with different proxy
for url in urls: for url in urls:
proxy = await get_next_proxy() proxy = await get_next_proxy()
browser_config.proxy_config = proxy # Clone the config and update proxy - this creates a new browser context
result = await crawler.arun(url=url, config=browser_config) current_config = run_config.clone(proxy_config=proxy)
result = await crawler.arun(url=url, config=current_config)
if __name__ == "__main__":
import asyncio
asyncio.run(main())
``` ```