feat(proxy): add proxy configuration support to CrawlerRunConfig

Add proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This enables users to specify different proxy settings for each crawl operation without modifying the browser config.

- Added proxy_config parameter to CrawlerRunConfig
- Updated BrowserManager to apply proxy settings from CrawlerRunConfig
- Updated proxy-security documentation with new usage examples
This commit is contained in:
UncleCode
2025-01-20 22:14:05 +08:00
parent 2cec527a22
commit 9247877037
4 changed files with 43 additions and 13 deletions

View File

@@ -1,3 +1,7 @@
### [Added] 2025-01-20
- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
- Updated documentation with examples for using proxy configuration in crawl operations
### [Added] 2025-01-20
- New LLM-powered schema generation utility for JsonElementExtractionStrategy
- Support for automatic CSS and XPath schema generation using OpenAI or Ollama

View File

@@ -270,6 +270,8 @@ class CrawlerRunConfig:
Default: "lxml".
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
Default: WebScrapingStrategy.
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
@@ -389,6 +391,7 @@ class CrawlerRunConfig:
prettiify: bool = False,
parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None,
proxy_config: dict = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
@@ -457,6 +460,7 @@ class CrawlerRunConfig:
self.prettiify = prettiify
self.parser_type = parser_type
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
self.proxy_config = proxy_config
# SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -553,6 +557,7 @@ class CrawlerRunConfig:
prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"),
scraping_strategy=kwargs.get("scraping_strategy"),
proxy_config=kwargs.get("proxy_config"),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters
@@ -631,6 +636,7 @@ class CrawlerRunConfig:
"prettiify": self.prettiify,
"parser_type": self.parser_type,
"scraping_strategy": self.scraping_strategy,
"proxy_config": self.proxy_config,
"fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode,
"session_id": self.session_id,

View File

@@ -543,9 +543,9 @@ class BrowserManager:
or crawlerRunConfig.simulate_user
or crawlerRunConfig.magic
):
await context.add_init_script(load_js_script("navigator_overrider"))
await context.add_init_script(load_js_script("navigator_overrider"))
async def create_browser_context(self):
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
"""
Creates and returns a new browser context with configured settings.
Applies text-only mode settings if text_mode is enabled in config.
@@ -627,6 +627,16 @@ class BrowserManager:
"device_scale_factor": 1.0,
"java_script_enabled": self.config.java_script_enabled,
}
if crawlerRunConfig:
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
if crawlerRunConfig.proxy_config:
proxy_settings = {
"server": crawlerRunConfig.proxy_config.get("server"),
"username": crawlerRunConfig.proxy_config.get("username"),
"password": crawlerRunConfig.proxy_config.get("password"),
}
context_settings["proxy"] = proxy_settings
if self.config.text_mode:
text_mode_settings = {
@@ -710,7 +720,7 @@ class BrowserManager:
context = self.contexts_by_config[config_signature]
else:
# Create and setup a new context
context = await self.create_browser_context()
context = await self.create_browser_context(crawlerRunConfig)
await self.setup_context(context, crawlerRunConfig)
self.contexts_by_config[config_signature] = context

View File

@@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
```
## Rotating Proxies
Here's the corrected documentation:
Example using a proxy rotation service and updating `BrowserConfig` dynamically:
## Rotating Proxies [COMING SOON]
Example using a proxy rotation service dynamically:
```python
from crawl4ai.async_configs import BrowserConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def get_next_proxy():
# Your proxy rotation logic here
return {"server": "http://next.proxy.com:8080"}
browser_config = BrowserConfig()
async with AsyncWebCrawler(config=browser_config) as crawler:
# Update proxy for each request
for url in urls:
proxy = await get_next_proxy()
browser_config.proxy_config = proxy
result = await crawler.arun(url=url, config=browser_config)
async def main():
browser_config = BrowserConfig()
run_config = CrawlerRunConfig()
async with AsyncWebCrawler(config=browser_config) as crawler:
# For each URL, create a new run config with different proxy
for url in urls:
proxy = await get_next_proxy()
# Clone the config and update proxy - this creates a new browser context
current_config = run_config.clone(proxy_config=proxy)
result = await crawler.arun(url=url, config=current_config)
if __name__ == "__main__":
import asyncio
asyncio.run(main())
```