feat(proxy): add proxy configuration support to CrawlerRunConfig

Add proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This enables users to specify different proxy settings for each crawl operation without modifying the browser config.

- Added proxy_config parameter to CrawlerRunConfig
- Updated BrowserManager to apply proxy settings from CrawlerRunConfig
- Updated proxy-security documentation with new usage examples
2025-01-20 22:14:05 +08:00
parent 2cec527a22
commit 9247877037
4 changed files with 43 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
 ### [Added] 2025-01-20
 - Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
 - Updated documentation with examples for using proxy configuration in crawl operations
 ### [Added] 2025-01-20
 - New LLM-powered schema generation utility for JsonElementExtractionStrategy
 - Support for automatic CSS and XPath schema generation using OpenAI or Ollama
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -270,6 +270,8 @@ class CrawlerRunConfig:
                           Default: "lxml".
        scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
                           Default: WebScrapingStrategy.
        proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                     If None, no additional proxy config. Default: None.
        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
@@ -389,6 +391,7 @@ class CrawlerRunConfig:
        prettiify: bool = False,
        parser_type: str = "lxml",
        scraping_strategy: ContentScrapingStrategy = None,
        proxy_config: dict = None,
        # SSL Parameters
        fetch_ssl_certificate: bool = False,
        # Caching Parameters
@@ -457,6 +460,7 @@ class CrawlerRunConfig:
        self.prettiify = prettiify
        self.parser_type = parser_type
        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
        self.proxy_config = proxy_config
        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -553,6 +557,7 @@ class CrawlerRunConfig:
            prettiify=kwargs.get("prettiify", False),
            parser_type=kwargs.get("parser_type", "lxml"),
            scraping_strategy=kwargs.get("scraping_strategy"),
            proxy_config=kwargs.get("proxy_config"),
            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
            # Caching Parameters
@@ -631,6 +636,7 @@ class CrawlerRunConfig:
            "prettiify": self.prettiify,
            "parser_type": self.parser_type,
            "scraping_strategy": self.scraping_strategy,
            "proxy_config": self.proxy_config,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -545,7 +545,7 @@ class BrowserManager:
            ):
                await context.add_init_script(load_js_script("navigator_overrider"))        
-    async def create_browser_context(self):
+    async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
        """
        Creates and returns a new browser context with configured settings.
        Applies text-only mode settings if text_mode is enabled in config.
@@ -628,6 +628,16 @@ class BrowserManager:
            "java_script_enabled": self.config.java_script_enabled,
        }
        if crawlerRunConfig:
            # If the run config carries a proxy_config, apply it as this context's proxy settings
            if crawlerRunConfig.proxy_config:
                proxy_settings = {
                    "server": crawlerRunConfig.proxy_config.get("server"),
                    "username": crawlerRunConfig.proxy_config.get("username"),
                    "password": crawlerRunConfig.proxy_config.get("password"),
                }
                context_settings["proxy"] = proxy_settings
        if self.config.text_mode:
            text_mode_settings = {
                "has_touch": False,
@@ -710,7 +720,7 @@ class BrowserManager:
                    context = self.contexts_by_config[config_signature]
                else:
                    # Create and setup a new context
-                    context = await self.create_browser_context()
+                    context = await self.create_browser_context(crawlerRunConfig)
                    await self.setup_context(context, crawlerRunConfig)
                    self.contexts_by_config[config_signature] = context
--- a/docs/md_v2/advanced/proxy-security.md
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url="https://example.com")
 ```
-## Rotating Proxies
+
-Example using a proxy rotation service and updating `BrowserConfig` dynamically:
+## Rotating Proxies [COMING SOON]
 Example using a proxy rotation service dynamically:
 ```python
-from crawl4ai.async_configs import BrowserConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 async def get_next_proxy():
    # Your proxy rotation logic here
    return {"server": "http://next.proxy.com:8080"}
 async def main():
    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig()
    async with AsyncWebCrawler(config=browser_config) as crawler:
-    # Update proxy for each request
+        # For each URL, create a new run config with different proxy
        for url in urls:
            proxy = await get_next_proxy()
-        browser_config.proxy_config = proxy
+            # Clone the config and update proxy - this creates a new browser context
-        result = await crawler.arun(url=url, config=browser_config)
+            current_config = run_config.clone(proxy_config=proxy)
            result = await crawler.arun(url=url, config=current_config)
 if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
 ```