From 9247877037395dbf9b2fca67241a134724ec0155 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 20 Jan 2025 22:14:05 +0800 Subject: [PATCH] feat(proxy): add proxy configuration support to CrawlerRunConfig Add proxy_config parameter to CrawlerRunConfig to support dynamic proxy configuration per crawl request. This enables users to specify different proxy settings for each crawl operation without modifying the browser config. - Added proxy_config parameter to CrawlerRunConfig - Updated BrowserManager to apply proxy settings from CrawlerRunConfig - Updated proxy-security documentation with new usage examples --- CHANGELOG.md | 4 ++++ crawl4ai/async_configs.py | 6 ++++++ crawl4ai/async_crawler_strategy.py | 16 +++++++++++--- docs/md_v2/advanced/proxy-security.md | 30 ++++++++++++++++++--------- 4 files changed, 43 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bea14df..93bd9bdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### [Added] 2025-01-20 +- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request +- Updated documentation with examples for using proxy configuration in crawl operations + ### [Added] 2025-01-20 - New LLM-powered schema generation utility for JsonElementExtractionStrategy - Support for automatic CSS and XPath schema generation using OpenAI or Ollama diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index f4914726..fbcb6e70 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -270,6 +270,8 @@ class CrawlerRunConfig: Default: "lxml". scraping_strategy (ContentScrapingStrategy): Scraping strategy to use. Default: WebScrapingStrategy. + proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + If None, no additional proxy config. Default: None. # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. 
@@ -389,6 +391,7 @@ class CrawlerRunConfig: prettiify: bool = False, parser_type: str = "lxml", scraping_strategy: ContentScrapingStrategy = None, + proxy_config: dict = None, # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters @@ -457,6 +460,7 @@ class CrawlerRunConfig: self.prettiify = prettiify self.parser_type = parser_type self.scraping_strategy = scraping_strategy or WebScrapingStrategy() + self.proxy_config = proxy_config # SSL Parameters self.fetch_ssl_certificate = fetch_ssl_certificate @@ -553,6 +557,7 @@ class CrawlerRunConfig: prettiify=kwargs.get("prettiify", False), parser_type=kwargs.get("parser_type", "lxml"), scraping_strategy=kwargs.get("scraping_strategy"), + proxy_config=kwargs.get("proxy_config"), # SSL Parameters fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), # Caching Parameters @@ -631,6 +636,7 @@ class CrawlerRunConfig: "prettiify": self.prettiify, "parser_type": self.parser_type, "scraping_strategy": self.scraping_strategy, + "proxy_config": self.proxy_config, "fetch_ssl_certificate": self.fetch_ssl_certificate, "cache_mode": self.cache_mode, "session_id": self.session_id, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 786d2fb9..ae1788f1 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -543,9 +543,9 @@ class BrowserManager: or crawlerRunConfig.simulate_user or crawlerRunConfig.magic ): - await context.add_init_script(load_js_script("navigator_overrider")) + await context.add_init_script(load_js_script("navigator_overrider")) - async def create_browser_context(self): + async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None): """ Creates and returns a new browser context with configured settings. Applies text-only mode settings if text_mode is enabled in config. 
@@ -627,6 +627,16 @@ class BrowserManager: "device_scale_factor": 1.0, "java_script_enabled": self.config.java_script_enabled, } + + if crawlerRunConfig: + # If crawlerRunConfig.proxy_config is set, apply it to the context settings + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.get("server"), + "username": crawlerRunConfig.proxy_config.get("username"), + "password": crawlerRunConfig.proxy_config.get("password"), + } + context_settings["proxy"] = proxy_settings if self.config.text_mode: text_mode_settings = { @@ -710,7 +720,7 @@ class BrowserManager: context = self.contexts_by_config[config_signature] else: # Create and setup a new context - context = await self.create_browser_context() + context = await self.create_browser_context(crawlerRunConfig) await self.setup_context(context, crawlerRunConfig) self.contexts_by_config[config_signature] = context diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index b98c17e5..9b64fd84 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -36,23 +36,33 @@ async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` -## Rotating Proxies + -Example using a proxy rotation service and updating `BrowserConfig` dynamically: +## Rotating Proxies [COMING SOON] + +Example using a proxy rotation service dynamically: ```python -from crawl4ai.async_configs import BrowserConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig async def get_next_proxy(): # Your proxy rotation logic here return {"server": "http://next.proxy.com:8080"} -browser_config = BrowserConfig() -async with AsyncWebCrawler(config=browser_config) as crawler: - # Update proxy for each request - for url in urls: - proxy = await get_next_proxy() - browser_config.proxy_config = proxy - result = await 
crawler.arun(url=url, config=browser_config) +async def main(): + browser_config = BrowserConfig() + run_config = CrawlerRunConfig() + + async with AsyncWebCrawler(config=browser_config) as crawler: + # For each URL, create a new run config with different proxy + for url in urls: + proxy = await get_next_proxy() + # Clone the config and update proxy - this creates a new browser context + current_config = run_config.clone(proxy_config=proxy) + result = await crawler.arun(url=url, config=current_config) + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) ```