feat(proxy): add proxy configuration support to CrawlerRunConfig
Add proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This enables users to specify different proxy settings for each crawl operation without modifying the browser config. - Added proxy_config parameter to CrawlerRunConfig - Updated BrowserManager to apply proxy settings from CrawlerRunConfig - Updated proxy-security documentation with new usage examples
This commit is contained in:
@@ -1,3 +1,7 @@
|
||||
### [Added] 2025-01-20
|
||||
- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
|
||||
- Updated documentation with examples for using proxy configuration in crawl operations
|
||||
|
||||
### [Added] 2025-01-20
|
||||
- New LLM-powered schema generation utility for JsonElementExtractionStrategy
|
||||
- Support for automatic CSS and XPath schema generation using OpenAI or Ollama
|
||||
|
||||
@@ -270,6 +270,8 @@ class CrawlerRunConfig:
|
||||
Default: "lxml".
|
||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||
Default: WebScrapingStrategy.
|
||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
|
||||
# Caching Parameters
|
||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||
@@ -389,6 +391,7 @@ class CrawlerRunConfig:
|
||||
prettiify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
scraping_strategy: ContentScrapingStrategy = None,
|
||||
proxy_config: dict = None,
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
# Caching Parameters
|
||||
@@ -457,6 +460,7 @@ class CrawlerRunConfig:
|
||||
self.prettiify = prettiify
|
||||
self.parser_type = parser_type
|
||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||
self.proxy_config = proxy_config
|
||||
|
||||
# SSL Parameters
|
||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||
@@ -553,6 +557,7 @@ class CrawlerRunConfig:
|
||||
prettiify=kwargs.get("prettiify", False),
|
||||
parser_type=kwargs.get("parser_type", "lxml"),
|
||||
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||
proxy_config=kwargs.get("proxy_config"),
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||
# Caching Parameters
|
||||
@@ -631,6 +636,7 @@ class CrawlerRunConfig:
|
||||
"prettiify": self.prettiify,
|
||||
"parser_type": self.parser_type,
|
||||
"scraping_strategy": self.scraping_strategy,
|
||||
"proxy_config": self.proxy_config,
|
||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||
"cache_mode": self.cache_mode,
|
||||
"session_id": self.session_id,
|
||||
|
||||
@@ -543,9 +543,9 @@ class BrowserManager:
|
||||
or crawlerRunConfig.simulate_user
|
||||
or crawlerRunConfig.magic
|
||||
):
|
||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||
|
||||
async def create_browser_context(self):
|
||||
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
|
||||
"""
|
||||
Creates and returns a new browser context with configured settings.
|
||||
Applies text-only mode settings if text_mode is enabled in config.
|
||||
@@ -627,6 +627,16 @@ class BrowserManager:
|
||||
"device_scale_factor": 1.0,
|
||||
"java_script_enabled": self.config.java_script_enabled,
|
||||
}
|
||||
|
||||
if crawlerRunConfig:
|
||||
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
|
||||
if crawlerRunConfig.proxy_config:
|
||||
proxy_settings = {
|
||||
"server": crawlerRunConfig.proxy_config.get("server"),
|
||||
"username": crawlerRunConfig.proxy_config.get("username"),
|
||||
"password": crawlerRunConfig.proxy_config.get("password"),
|
||||
}
|
||||
context_settings["proxy"] = proxy_settings
|
||||
|
||||
if self.config.text_mode:
|
||||
text_mode_settings = {
|
||||
@@ -710,7 +720,7 @@ class BrowserManager:
|
||||
context = self.contexts_by_config[config_signature]
|
||||
else:
|
||||
# Create and setup a new context
|
||||
context = await self.create_browser_context()
|
||||
context = await self.create_browser_context(crawlerRunConfig)
|
||||
await self.setup_context(context, crawlerRunConfig)
|
||||
self.contexts_by_config[config_signature] = context
|
||||
|
||||
|
||||
@@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
## Rotating Proxies
|
||||
Here's the corrected documentation:
|
||||
|
||||
Example using a proxy rotation service and updating `BrowserConfig` dynamically:
|
||||
## Rotating Proxies [COMING SOON]
|
||||
|
||||
Example using a proxy rotation service dynamically:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def get_next_proxy():
|
||||
# Your proxy rotation logic here
|
||||
return {"server": "http://next.proxy.com:8080"}
|
||||
|
||||
browser_config = BrowserConfig()
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Update proxy for each request
|
||||
for url in urls:
|
||||
proxy = await get_next_proxy()
|
||||
browser_config.proxy_config = proxy
|
||||
result = await crawler.arun(url=url, config=browser_config)
|
||||
async def main():
|
||||
browser_config = BrowserConfig()
|
||||
run_config = CrawlerRunConfig()
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# For each URL, create a new run config with different proxy
|
||||
for url in urls:
|
||||
proxy = await get_next_proxy()
|
||||
# Clone the config and update proxy - this creates a new browser context
|
||||
current_config = run_config.clone(proxy_config=proxy)
|
||||
result = await crawler.arun(url=url, config=current_config)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user