feat(proxy): add proxy configuration support to CrawlerRunConfig
Add a proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This lets users specify different proxy settings for each crawl operation without modifying the browser config.

- Added proxy_config parameter to CrawlerRunConfig
- Updated BrowserManager to apply proxy settings from CrawlerRunConfig
- Updated proxy-security documentation with new usage examples
This commit is contained in:
@@ -1,3 +1,7 @@
|
|||||||
|
### [Added] 2025-01-20
|
||||||
|
- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
|
||||||
|
- Updated documentation with examples for using proxy configuration in crawl operations
|
||||||
|
|
||||||
### [Added] 2025-01-20
|
### [Added] 2025-01-20
|
||||||
- New LLM-powered schema generation utility for JsonElementExtractionStrategy
|
- New LLM-powered schema generation utility for JsonElementExtractionStrategy
|
||||||
- Support for automatic CSS and XPath schema generation using OpenAI or Ollama
|
- Support for automatic CSS and XPath schema generation using OpenAI or Ollama
|
||||||
|
|||||||
@@ -270,6 +270,8 @@ class CrawlerRunConfig:
|
|||||||
Default: "lxml".
|
Default: "lxml".
|
||||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||||
Default: WebScrapingStrategy.
|
Default: WebScrapingStrategy.
|
||||||
|
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||||
|
If None, no additional proxy config. Default: None.
|
||||||
|
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||||
@@ -389,6 +391,7 @@ class CrawlerRunConfig:
|
|||||||
prettiify: bool = False,
|
prettiify: bool = False,
|
||||||
parser_type: str = "lxml",
|
parser_type: str = "lxml",
|
||||||
scraping_strategy: ContentScrapingStrategy = None,
|
scraping_strategy: ContentScrapingStrategy = None,
|
||||||
|
proxy_config: dict = None,
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate: bool = False,
|
fetch_ssl_certificate: bool = False,
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
@@ -457,6 +460,7 @@ class CrawlerRunConfig:
|
|||||||
self.prettiify = prettiify
|
self.prettiify = prettiify
|
||||||
self.parser_type = parser_type
|
self.parser_type = parser_type
|
||||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||||
|
self.proxy_config = proxy_config
|
||||||
|
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||||
@@ -553,6 +557,7 @@ class CrawlerRunConfig:
|
|||||||
prettiify=kwargs.get("prettiify", False),
|
prettiify=kwargs.get("prettiify", False),
|
||||||
parser_type=kwargs.get("parser_type", "lxml"),
|
parser_type=kwargs.get("parser_type", "lxml"),
|
||||||
scraping_strategy=kwargs.get("scraping_strategy"),
|
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||||
|
proxy_config=kwargs.get("proxy_config"),
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
@@ -631,6 +636,7 @@ class CrawlerRunConfig:
|
|||||||
"prettiify": self.prettiify,
|
"prettiify": self.prettiify,
|
||||||
"parser_type": self.parser_type,
|
"parser_type": self.parser_type,
|
||||||
"scraping_strategy": self.scraping_strategy,
|
"scraping_strategy": self.scraping_strategy,
|
||||||
|
"proxy_config": self.proxy_config,
|
||||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||||
"cache_mode": self.cache_mode,
|
"cache_mode": self.cache_mode,
|
||||||
"session_id": self.session_id,
|
"session_id": self.session_id,
|
||||||
|
|||||||
@@ -545,7 +545,7 @@ class BrowserManager:
|
|||||||
):
|
):
|
||||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||||
|
|
||||||
async def create_browser_context(self):
|
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
|
||||||
"""
|
"""
|
||||||
Creates and returns a new browser context with configured settings.
|
Creates and returns a new browser context with configured settings.
|
||||||
Applies text-only mode settings if text_mode is enabled in config.
|
Applies text-only mode settings if text_mode is enabled in config.
|
||||||
@@ -628,6 +628,16 @@ class BrowserManager:
|
|||||||
"java_script_enabled": self.config.java_script_enabled,
|
"java_script_enabled": self.config.java_script_enabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if crawlerRunConfig:
|
||||||
|
# If crawlerRunConfig.proxy_config is set, add its proxy settings to the context settings
|
||||||
|
if crawlerRunConfig.proxy_config:
|
||||||
|
proxy_settings = {
|
||||||
|
"server": crawlerRunConfig.proxy_config.get("server"),
|
||||||
|
"username": crawlerRunConfig.proxy_config.get("username"),
|
||||||
|
"password": crawlerRunConfig.proxy_config.get("password"),
|
||||||
|
}
|
||||||
|
context_settings["proxy"] = proxy_settings
|
||||||
|
|
||||||
if self.config.text_mode:
|
if self.config.text_mode:
|
||||||
text_mode_settings = {
|
text_mode_settings = {
|
||||||
"has_touch": False,
|
"has_touch": False,
|
||||||
@@ -710,7 +720,7 @@ class BrowserManager:
|
|||||||
context = self.contexts_by_config[config_signature]
|
context = self.contexts_by_config[config_signature]
|
||||||
else:
|
else:
|
||||||
# Create and setup a new context
|
# Create and setup a new context
|
||||||
context = await self.create_browser_context()
|
context = await self.create_browser_context(crawlerRunConfig)
|
||||||
await self.setup_context(context, crawlerRunConfig)
|
await self.setup_context(context, crawlerRunConfig)
|
||||||
self.contexts_by_config[config_signature] = context
|
self.contexts_by_config[config_signature] = context
|
||||||
|
|
||||||
|
|||||||
@@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
|
|||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
```
|
```
|
||||||
|
|
||||||
## Rotating Proxies
|
|
||||||
|
|
||||||
Example using a proxy rotation service and updating `BrowserConfig` dynamically:
|
## Rotating Proxies [COMING SOON]
|
||||||
|
|
||||||
|
Example using a proxy rotation service to apply a different proxy to each request:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import BrowserConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
async def get_next_proxy():
|
async def get_next_proxy():
|
||||||
# Your proxy rotation logic here
|
# Your proxy rotation logic here
|
||||||
return {"server": "http://next.proxy.com:8080"}
|
return {"server": "http://next.proxy.com:8080"}
|
||||||
|
|
||||||
browser_config = BrowserConfig()
|
async def main():
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
browser_config = BrowserConfig()
|
||||||
# Update proxy for each request
|
run_config = CrawlerRunConfig()
|
||||||
for url in urls:
|
|
||||||
proxy = await get_next_proxy()
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
browser_config.proxy_config = proxy
|
# For each URL, create a new run config with a different proxy
|
||||||
result = await crawler.arun(url=url, config=browser_config)
|
for url in urls:
|
||||||
|
proxy = await get_next_proxy()
|
||||||
|
# Clone the config and update proxy - this creates a new browser context
|
||||||
|
current_config = run_config.clone(proxy_config=proxy)
|
||||||
|
result = await crawler.arun(url=url, config=current_config)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user