feat(proxy): add proxy configuration support to CrawlerRunConfig

Add proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This enables users to specify different proxy settings for each crawl operation without modifying the browser config.

- Added proxy_config parameter to CrawlerRunConfig
- Updated BrowserManager to apply proxy settings from CrawlerRunConfig
- Updated proxy-security documentation with new usage examples
2025-01-20 22:14:05 +08:00
parent 2cec527a22
commit 9247877037
4 changed files with 43 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+### [Added] 2025-01-20
+- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
+- Updated documentation with examples for using proxy configuration in crawl operations
+
 ### [Added] 2025-01-20
 - New LLM-powered schema generation utility for JsonElementExtractionStrategy
 - Support for automatic CSS and XPath schema generation using OpenAI or Ollama
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -270,6 +270,8 @@ class CrawlerRunConfig:
                           Default: "lxml".
        scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
                           Default: WebScrapingStrategy.
+        proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
+                                     If None, no additional proxy config. Default: None.

        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
@@ -389,6 +391,7 @@ class CrawlerRunConfig:
        prettiify: bool = False,
        parser_type: str = "lxml",
        scraping_strategy: ContentScrapingStrategy = None,
+        proxy_config: dict = None,
        # SSL Parameters
        fetch_ssl_certificate: bool = False,
        # Caching Parameters
@@ -457,6 +460,7 @@ class CrawlerRunConfig:
        self.prettiify = prettiify
        self.parser_type = parser_type
        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
+        self.proxy_config = proxy_config

        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -553,6 +557,7 @@ class CrawlerRunConfig:
            prettiify=kwargs.get("prettiify", False),
            parser_type=kwargs.get("parser_type", "lxml"),
            scraping_strategy=kwargs.get("scraping_strategy"),
+            proxy_config=kwargs.get("proxy_config"),
            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
            # Caching Parameters
@@ -631,6 +636,7 @@ class CrawlerRunConfig:
            "prettiify": self.prettiify,
            "parser_type": self.parser_type,
            "scraping_strategy": self.scraping_strategy,
+            "proxy_config": self.proxy_config,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -543,9 +543,9 @@ class BrowserManager:
                or crawlerRunConfig.simulate_user
                or crawlerRunConfig.magic
            ):
-                await context.add_init_script(load_js_script("navigator_overrider"))
+                await context.add_init_script(load_js_script("navigator_overrider"))        

-    async def create_browser_context(self):
+    async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
        """
        Creates and returns a new browser context with configured settings.
        Applies text-only mode settings if text_mode is enabled in config.
@@ -627,6 +627,16 @@ class BrowserManager:
            "device_scale_factor": 1.0,
            "java_script_enabled": self.config.java_script_enabled,
        }
+        
+        if crawlerRunConfig:
+            # If crawlerRunConfig.proxy_config is set, add it to the context settings as the proxy
+            if crawlerRunConfig.proxy_config:
+                proxy_settings = {
+                    "server": crawlerRunConfig.proxy_config.get("server"),
+                    "username": crawlerRunConfig.proxy_config.get("username"),
+                    "password": crawlerRunConfig.proxy_config.get("password"),
+                }
+                context_settings["proxy"] = proxy_settings

        if self.config.text_mode:
            text_mode_settings = {
@@ -710,7 +720,7 @@ class BrowserManager:
                    context = self.contexts_by_config[config_signature]
                else:
                    # Create and setup a new context
-                    context = await self.create_browser_context()
+                    context = await self.create_browser_context(crawlerRunConfig)
                    await self.setup_context(context, crawlerRunConfig)
                    self.contexts_by_config[config_signature] = context

--- a/docs/md_v2/advanced/proxy-security.md
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url="https://example.com")
 ```

-## Rotating Proxies
+Proxy settings can also be supplied per crawl request through the `proxy_config` parameter of `CrawlerRunConfig`.

-Example using a proxy rotation service and updating `BrowserConfig` dynamically:
+## Rotating Proxies [COMING SOON]
+
+Example of rotating proxies per request by passing a new `proxy_config` to each crawl:

 ```python
-from crawl4ai.async_configs import BrowserConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

 async def get_next_proxy():
    # Your proxy rotation logic here
    return {"server": "http://next.proxy.com:8080"}

-browser_config = BrowserConfig()
-async with AsyncWebCrawler(config=browser_config) as crawler:
-    # Update proxy for each request
-    for url in urls:
-        proxy = await get_next_proxy()
-        browser_config.proxy_config = proxy
-        result = await crawler.arun(url=url, config=browser_config)
+async def main():
+    browser_config = BrowserConfig()
+    run_config = CrawlerRunConfig()
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # For each URL, create a new run config with different proxy
+        for url in urls:
+            proxy = await get_next_proxy()
+            # Clone the config and update proxy - this creates a new browser context
+            current_config = run_config.clone(proxy_config=proxy)
+            result = await crawler.arun(url=url, config=current_config)
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
 ```