diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index edcb4b4e..0e39b551 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -650,6 +650,12 @@ class CrawlerRunConfig(): user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. Default: None. + # Experimental Parameters + experimental (dict): Dictionary containing experimental parameters that are in beta phase. + This allows passing temporary features that are not yet fully integrated + into the main parameter set. + Default: None. + url: str = None # This is not a compulsory parameter """ @@ -732,6 +738,8 @@ class CrawlerRunConfig(): user_agent_generator_config: dict = {}, # Deep Crawl Parameters deep_crawl_strategy: Optional[DeepCrawlStrategy] = None, + # Experimental Parameters + experimental: Dict[str, Any] = None, ): # TODO: Planning to set properties dynamically based on the __init__ signature self.url = url @@ -845,6 +853,9 @@ class CrawlerRunConfig(): # Deep Crawl Parameters self.deep_crawl_strategy = deep_crawl_strategy + + # Experimental Parameters + self.experimental = experimental or {} def __getattr__(self, name): @@ -953,6 +964,8 @@ class CrawlerRunConfig(): # Deep Crawl Parameters deep_crawl_strategy=kwargs.get("deep_crawl_strategy"), url=kwargs.get("url"), + # Experimental Parameters + experimental=kwargs.get("experimental"), ) # Create a funciton returns dict of the object @@ -1037,6 +1050,7 @@ class CrawlerRunConfig(): "user_agent_generator_config": self.user_agent_generator_config, "deep_crawl_strategy": self.deep_crawl_strategy, "url": self.url, + "experimental": self.experimental, } def clone(self, **kwargs): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 960c2d6f..37aa0962 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -507,10 +507,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Get page for session page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + # await page.goto(URL) + # Add default cookie - await context.add_cookies( - [{"name": "cookiesEnabled", "value": "true", "url": url}] - ) + # await context.add_cookies( + # [{"name": "cookiesEnabled", "value": "true", "url": url}] + # ) # Handle navigator overrides if config.override_navigator or config.simulate_user or config.magic: @@ -562,14 +564,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): try: # Generate a unique nonce for this request - nonce = hashlib.sha256(os.urandom(32)).hexdigest() + if config.experimental.get("use_csp_nonce", False): + nonce = hashlib.sha256(os.urandom(32)).hexdigest() - # Add CSP headers to the request - await page.set_extra_http_headers( - { - "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" - } - ) + # Add CSP headers to the request + await page.set_extra_http_headers( + { + "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" + } + ) response = await page.goto( url, wait_until=config.wait_until, timeout=config.page_timeout diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 38f87d9a..06b36a32 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -443,19 +443,6 @@ class BrowserManager: self.default_context = contexts[0] else: self.default_context = await self.create_browser_context() - # self.default_context = await self.browser.new_context( - # viewport={ - # "width": self.config.viewport_width, - # "height": self.config.viewport_height, - # }, - # storage_state=self.config.storage_state, - # user_agent=self.config.headers.get( - # "User-Agent", self.config.user_agent - # ), - # accept_downloads=self.config.accept_downloads, - # ignore_https_errors=self.config.ignore_https_errors, - # java_script_enabled=self.config.java_script_enabled, - # ) await self.setup_context(self.default_context) else: browser_args = self._build_browser_args() @@ -470,6 +457,7 @@ class BrowserManager: self.default_context = self.browser + def _build_browser_args(self) -> dict: """Build browser launch arguments from config.""" args = [ diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 305e8a31..33802772 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -391,7 +391,10 @@ async def handle_crawl_request( async with AsyncWebCrawler(config=browser_config) as crawler: results = [] func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") - partial_func = partial(func, urls[0] if len(urls) == 1 else urls, config=crawler_config, dispatcher=dispatcher) + partial_func = partial(func, + urls[0] if len(urls) == 1 else urls, + config=crawler_config, + dispatcher=dispatcher) results = await partial_func() return { "success": True,