From 7884a98be7fe891963e9c82525b25660ff86c26e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 14 Mar 2025 14:39:24 +0800 Subject: [PATCH] feat(crawler): add experimental parameters support and optimize browser handling Add experimental parameters dictionary to CrawlerRunConfig to support beta features Make CSP nonce headers optional via experimental config Remove default cookie injection Clean up browser context creation code Improve code formatting in API handler BREAKING CHANGE: Default cookie injection has been removed from page initialization --- crawl4ai/async_configs.py | 14 ++++++++++++++ crawl4ai/async_crawler_strategy.py | 23 +++++++++++++---------- crawl4ai/browser_manager.py | 14 +------------- deploy/docker/api.py | 5 ++++- 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index edcb4b4e..0e39b551 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -650,6 +650,12 @@ class CrawlerRunConfig(): user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. Default: None. + # Experimental Parameters + experimental (dict): Dictionary containing experimental parameters that are in beta phase. + This allows passing temporary features that are not yet fully integrated + into the main parameter set. + Default: None. + url: str = None # This is not a compulsory parameter """ @@ -732,6 +738,8 @@ class CrawlerRunConfig(): user_agent_generator_config: dict = {}, # Deep Crawl Parameters deep_crawl_strategy: Optional[DeepCrawlStrategy] = None, + # Experimental Parameters + experimental: Dict[str, Any] = None, ): # TODO: Planning to set properties dynamically based on the __init__ signature self.url = url @@ -845,6 +853,9 @@ class CrawlerRunConfig(): # Deep Crawl Parameters self.deep_crawl_strategy = deep_crawl_strategy + + # Experimental Parameters + self.experimental = experimental or {} def __getattr__(self, name): @@ -953,6 +964,8 @@ class CrawlerRunConfig(): # Deep Crawl Parameters deep_crawl_strategy=kwargs.get("deep_crawl_strategy"), url=kwargs.get("url"), + # Experimental Parameters + experimental=kwargs.get("experimental"), ) # Create a funciton returns dict of the object @@ -1037,6 +1050,7 @@ class CrawlerRunConfig(): "user_agent_generator_config": self.user_agent_generator_config, "deep_crawl_strategy": self.deep_crawl_strategy, "url": self.url, + "experimental": self.experimental, } def clone(self, **kwargs): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 960c2d6f..37aa0962 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -507,10 +507,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Get page for session page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + # await page.goto(URL) + # Add default cookie - await context.add_cookies( - [{"name": "cookiesEnabled", "value": "true", "url": url}] - ) + # await context.add_cookies( + # [{"name": "cookiesEnabled", "value": "true", "url": url}] + # ) # Handle navigator overrides if config.override_navigator or config.simulate_user or config.magic: @@ -562,14 +564,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): try: # Generate a unique nonce for this request - nonce = hashlib.sha256(os.urandom(32)).hexdigest() + if config.experimental.get("use_csp_nonce", False): + nonce = hashlib.sha256(os.urandom(32)).hexdigest() - # Add CSP headers to the request - await page.set_extra_http_headers( - { - "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" - } - ) + # Add CSP headers to the request + await page.set_extra_http_headers( + { + "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" + } + ) response = await page.goto( url, wait_until=config.wait_until, timeout=config.page_timeout diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 38f87d9a..06b36a32 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -443,19 +443,6 @@ class BrowserManager: self.default_context = contexts[0] else: self.default_context = await self.create_browser_context() - # self.default_context = await self.browser.new_context( - # viewport={ - # "width": self.config.viewport_width, - # "height": self.config.viewport_height, - # }, - # storage_state=self.config.storage_state, - # user_agent=self.config.headers.get( - # "User-Agent", self.config.user_agent - # ), - # accept_downloads=self.config.accept_downloads, - # ignore_https_errors=self.config.ignore_https_errors, - # java_script_enabled=self.config.java_script_enabled, - # ) await self.setup_context(self.default_context) else: browser_args = self._build_browser_args() @@ -470,6 +457,7 @@ class BrowserManager: self.default_context = self.browser + def _build_browser_args(self) -> dict: """Build browser launch arguments from config.""" args = [ diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 305e8a31..33802772 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -391,7 +391,10 @@ async def handle_crawl_request( async with AsyncWebCrawler(config=browser_config) as crawler: results = [] func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") - partial_func = partial(func, urls[0] if len(urls) == 1 else urls, config=crawler_config, dispatcher=dispatcher) + partial_func = partial(func, + urls[0] if len(urls) == 1 else urls, + config=crawler_config, + dispatcher=dispatcher) results = await partial_func() return { "success": True,