diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index a5250455..4677e1d5 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -112,6 +112,9 @@ class BrowserConfig: self.user_data_dir = user_data_dir self.chrome_channel = chrome_channel or self.browser_type or "chromium" self.channel = channel or self.browser_type or "chromium" + if self.browser_type in ["firefox", "webkit"]: + self.channel = "" + self.chrome_channel = "" self.proxy = proxy self.proxy_config = proxy_config self.viewport_width = viewport_width @@ -239,6 +242,8 @@ class CrawlerRunConfig: Default: False. no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY. Default: False. + shared_data (dict or None): Shared data to be passed between hooks. + Default: None. # Page Navigation and Timing Parameters wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded". @@ -344,6 +349,7 @@ class CrawlerRunConfig: disable_cache: bool = False, no_cache_read: bool = False, no_cache_write: bool = False, + shared_data: dict = None, # Page Navigation and Timing Parameters wait_until: str = "domcontentloaded", page_timeout: int = PAGE_TIMEOUT, @@ -411,6 +417,7 @@ class CrawlerRunConfig: self.disable_cache = disable_cache self.no_cache_read = no_cache_read self.no_cache_write = no_cache_write + self.shared_data = shared_data # Page Navigation and Timing Parameters self.wait_until = wait_until @@ -501,6 +508,7 @@ class CrawlerRunConfig: disable_cache=kwargs.get("disable_cache", False), no_cache_read=kwargs.get("no_cache_read", False), no_cache_write=kwargs.get("no_cache_write", False), + shared_data=kwargs.get("shared_data", None), # Page Navigation and Timing Parameters wait_until=kwargs.get("wait_until", "domcontentloaded"), page_timeout=kwargs.get("page_timeout", 60000), @@ -574,6 +582,8 @@ class CrawlerRunConfig: "disable_cache": self.disable_cache, "no_cache_read": self.no_cache_read, "no_cache_write": self.no_cache_write, + 
"shared_data": self.shared_data, + # Page Navigation and Timing Parameters "wait_until": self.wait_until, "page_timeout": self.page_timeout, "wait_for": self.wait_for, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 60590035..758157a5 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -326,6 +326,10 @@ class BrowserManager: self.sessions = {} self.session_ttl = 1800 # 30 minutes + # Keep track of contexts by a "config signature," so each unique config reuses a single context + self.contexts_by_config = {} + self._contexts_lock = asyncio.Lock() + # Initialize ManagedBrowser if needed if self.config.use_managed_browser: self.managed_browser = ManagedBrowser( @@ -642,7 +646,38 @@ class BrowserManager: await context.route(f"**/*.{ext}", lambda route: route.abort()) return context - # async def get_page(self, session_id: Optional[str], user_agent: str): + def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: + """ + Converts the crawlerRunConfig into a dict, excludes ephemeral fields, + then returns a hash of the sorted JSON. This yields a stable signature + that identifies configurations requiring a unique browser context. + """ + import json, hashlib + + config_dict = crawlerRunConfig.__dict__.copy() + # Exclude items that do not affect browser-level setup. + # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config. 
+ ephemeral_keys = [ + "session_id", + "js_code", + "scraping_strategy", + "extraction_strategy", + "chunking_strategy", + "cache_mode", + "content_filter", + "semaphore_count", + "url" + ] + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] + # Convert to canonical JSON string + signature_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON so we get a compact, unique string + signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() + return signature_hash + async def get_page(self, crawlerRunConfig: CrawlerRunConfig): """ Get a page for the given session ID, creating a new one if needed. @@ -651,24 +686,38 @@ class BrowserManager: crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings Returns: - Page: The page object for the given session ID. - BrowserContext: The browser context for the given session ID. + (page, context): The Page and its BrowserContext """ self._cleanup_expired_sessions() + # If a session_id is provided and we already have it, reuse that page + context if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) return page, context + # If using a managed browser, just grab the shared default_context if self.config.use_managed_browser: context = self.default_context page = await context.new_page() else: - context = await self.create_browser_context() - await self.setup_context(context, crawlerRunConfig) + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and setup a new context + context = await 
self.create_browser_context() + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context page = await context.new_page() + # If a session_id is specified, store this session so we can reuse later if crawlerRunConfig.session_id: self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) @@ -708,6 +757,18 @@ class BrowserManager: for session_id in session_ids: await self.kill_session(session_id) + # Now close all contexts we created. This reclaims memory from ephemeral contexts. + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception as e: + self.logger.error( + message="Error closing context: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.contexts_by_config.clear() + if self.browser: await self.browser.close() self.browser = None @@ -1204,7 +1265,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await context.add_init_script(load_js_script("navigator_overrider")) # Call hook after page creation - await self.execute_hook("on_page_context_created", page, context=context) + await self.execute_hook("on_page_context_created", page, context=context, config=config) # Set up console logging if requested if config.log_console: @@ -1245,7 +1306,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Handle page navigation and content loading if not config.js_only: - await self.execute_hook("before_goto", page, context=context, url=url) + await self.execute_hook("before_goto", page, context=context, url=url, config=config) try: # Generate a unique nonce for this request @@ -1265,7 +1326,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") await self.execute_hook( - "after_goto", page, context=context, url=url, response=response + "after_goto", page, context=context, url=url, response=response, config=config ) if 
response is None: @@ -1439,7 +1500,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": execution_result.get("error")}, ) - await self.execute_hook("on_execution_started", page, context=context) + await self.execute_hook("on_execution_started", page, context=context, config=config) # Handle user simulation if config.simulate_user or config.magic: @@ -1482,7 +1543,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page = await self.process_iframes(page) # Pre-content retrieval hooks and delay - await self.execute_hook("before_retrieve_html", page, context=context) + await self.execute_hook("before_retrieve_html", page, context=context, config=config) if config.delay_before_return_html: await asyncio.sleep(config.delay_before_return_html) @@ -1493,7 +1554,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Get final HTML content html = await page.content() await self.execute_hook( - "before_return_html", page=page, html=html, context=context + "before_return_html", page=page, html=html, context=context, config=config ) # Handle PDF and screenshot generation @@ -1989,10 +2050,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): t1 = time.time() try: await page.wait_for_load_state("domcontentloaded", timeout=5000) - print( - "DOM content loaded after script execution in", - time.time() - t1, - ) except Error as e: self.logger.warning( message="DOM content load timeout: {error}", @@ -2099,13 +2156,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Wait for network idle after script execution t1 = time.time() await page.wait_for_load_state("domcontentloaded", timeout=5000) - print( - "DOM content loaded after script execution in", time.time() - t1 - ) + t1 = time.time() await page.wait_for_load_state("networkidle", timeout=5000) - print("Network idle after script execution in", time.time() - t1) results.append(result if result else {"success": True}) diff --git 
a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index a2a02da8..b58443bd 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -231,7 +231,7 @@ async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") schema = { "name": "KidoCode Courses", - "baseSelector": "section.charge-methodology .w-tab-content > div", + "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item", "fields": [ { "name": "section_title", @@ -279,6 +279,7 @@ async def extract_structured_data_using_css_extractor(): cache_mode=CacheMode.BYPASS, extraction_strategy=JsonCssExtractionStrategy(schema), js_code=[js_click_tabs], + delay_before_return_html=1 ) async with AsyncWebCrawler(config=browser_config) as crawler: @@ -591,29 +592,26 @@ async def speed_comparison(): # Main execution async def main(): # Basic examples - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() # Advanced examples - # await extract_structured_data_using_css_extractor() + await extract_structured_data_using_css_extractor() await extract_structured_data_using_llm( "openai/gpt-4o", os.getenv("OPENAI_API_KEY") ) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() # Browser comparisons - # await crawl_custom_browser_type() - - # Performance testing - # await speed_comparison() + await crawl_custom_browser_type() # Screenshot example - # await capture_and_save_screenshot( - # "https://www.example.com", - # os.path.join(__location__, "tmp/example_screenshot.jpg") - # ) + await capture_and_save_screenshot( + "https://www.example.com", + 
os.path.join(__location__, "tmp/example_screenshot.jpg") + ) if __name__ == "__main__": diff --git a/tests/20241401/test_cache_context.py b/tests/20241401/test_cache_context.py new file mode 100644 index 00000000..0f42f9fd --- /dev/null +++ b/tests/20241401/test_cache_context.py @@ -0,0 +1,85 @@ +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from playwright.async_api import Page, BrowserContext + +async def test_reuse_context_by_config(): + # We will store each context ID in these lists to confirm reuse + context_ids_for_A = [] + context_ids_for_B = [] + + # Create a small hook to track which context each page was created in + async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs): + c_id = id(context) + print(f"[HOOK] on_page_context_created - Context ID: {c_id}") + # Distinguish which config we used by reading the label stored in config.shared_data + config_label = config.shared_data.get("config_label", "unknown") + if config_label == "A": + context_ids_for_A.append(c_id) + elif config_label == "B": + context_ids_for_B.append(c_id) + return page + + # Browser config - Headless, verbose so we see logs + browser_config = BrowserConfig(headless=True, verbose=True) + + # Two crawler run configs that differ (here: only_text and the shared_data label): + configA = CrawlerRunConfig( + only_text=True, + cache_mode=CacheMode.BYPASS, + wait_until="domcontentloaded", + shared_data = { + "config_label" : "A" + } + ) + configB = CrawlerRunConfig( + only_text=False, + cache_mode=CacheMode.BYPASS, + wait_until="domcontentloaded", + shared_data = { + "config_label" : "B" + } + ) + + # Create the crawler + crawler = AsyncWebCrawler(config=browser_config) + + # Attach our custom hook + # Note: "on_page_context_created" is called for every new page, even when its context is reused + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + + # Start the crawler (launches the browser) + await crawler.start() + + #
For demonstration, we’ll crawl a benign site multiple times with each config + test_url = "https://example.com" + print("\n--- Crawling with config A (text_mode=True) ---") + for _ in range(2): + # The hook learns which config is in use from config.shared_data["config_label"] + await crawler.arun(test_url, config=configA) + + print("\n--- Crawling with config B (text_mode=False) ---") + for _ in range(2): + await crawler.arun(test_url, config=configB) + + # Close the crawler (shuts down the browser, closes contexts) + await crawler.close() + + # Validate and show the results + print("\n=== RESULTS ===") + print(f"Config A context IDs: {context_ids_for_A}") + print(f"Config B context IDs: {context_ids_for_B}") + if len(set(context_ids_for_A)) == 1: + print("✅ All config A crawls used the SAME BrowserContext.") + else: + print("❌ Config A crawls created multiple contexts unexpectedly.") + if len(set(context_ids_for_B)) == 1: + print("✅ All config B crawls used the SAME BrowserContext.") + else: + print("❌ Config B crawls created multiple contexts unexpectedly.") + if set(context_ids_for_A).isdisjoint(context_ids_for_B): + print("✅ Config A context is different from Config B context.") + else: + print("❌ A and B ended up sharing the same context somehow!") + +if __name__ == "__main__": + asyncio.run(test_reuse_context_by_config())