feat(browser): improve browser context management and add shared data support

Add shared_data parameter to CrawlerRunConfig to allow data sharing between hooks. Implement browser context reuse based on config signatures to improve memory usage. Fix Firefox/Webkit channel settings. Add config parameter to hook callbacks for better context access. Remove debug print statements. BREAKING CHANGE: Hook callback signatures now include config parameter
2025-01-19 17:12:03 +08:00
parent e363234172
commit 1221be30a3
4 changed files with 181 additions and 34 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -112,6 +112,9 @@ class BrowserConfig:
        self.user_data_dir = user_data_dir
        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
        self.channel = channel or self.browser_type or "chromium"
        if self.browser_type in ["firefox", "webkit"]:
            self.channel = ""
            self.chrome_channel = ""
        self.proxy = proxy
        self.proxy_config = proxy_config
        self.viewport_width = viewport_width
@@ -239,6 +242,8 @@ class CrawlerRunConfig:
                              Default: False.
        no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
                               Default: False.
        shared_data (dict or None): Shared data to be passed between hooks.
                                     Default: None.
        # Page Navigation and Timing Parameters
        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
@@ -344,6 +349,7 @@ class CrawlerRunConfig:
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        shared_data: dict = None,
        # Page Navigation and Timing Parameters
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
@@ -411,6 +417,7 @@ class CrawlerRunConfig:
        self.disable_cache = disable_cache
        self.no_cache_read = no_cache_read
        self.no_cache_write = no_cache_write
        self.shared_data = shared_data
        # Page Navigation and Timing Parameters
        self.wait_until = wait_until
@@ -501,6 +508,7 @@ class CrawlerRunConfig:
            disable_cache=kwargs.get("disable_cache", False),
            no_cache_read=kwargs.get("no_cache_read", False),
            no_cache_write=kwargs.get("no_cache_write", False),
            shared_data=kwargs.get("shared_data", None),
            # Page Navigation and Timing Parameters
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            page_timeout=kwargs.get("page_timeout", 60000),
@@ -574,6 +582,8 @@ class CrawlerRunConfig:
            "disable_cache": self.disable_cache,
            "no_cache_read": self.no_cache_read,
            "no_cache_write": self.no_cache_write,
            "shared_data": self.shared_data,
            # Page Navigation and Timing Parameters 
            "wait_until": self.wait_until,
            "page_timeout": self.page_timeout,
            "wait_for": self.wait_for,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -326,6 +326,10 @@ class BrowserManager:
        self.sessions = {}
        self.session_ttl = 1800  # 30 minutes
        # Keep track of contexts by a "config signature," so each unique config reuses a single context
        self.contexts_by_config = {}
        self._contexts_lock = asyncio.Lock() 
        # Initialize ManagedBrowser if needed
        if self.config.use_managed_browser:
            self.managed_browser = ManagedBrowser(
@@ -642,7 +646,38 @@ class BrowserManager:
                await context.route(f"**/*.{ext}", lambda route: route.abort())
        return context
-    # async def get_page(self, session_id: Optional[str], user_agent: str):
+    def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
        """
        Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
        then returns a hash of the sorted JSON. This yields a stable signature
        that identifies configurations requiring a unique browser context.
        """
        import json, hashlib
        config_dict = crawlerRunConfig.__dict__.copy()
        # Exclude items that do not affect browser-level setup.
        # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
        ephemeral_keys = [
            "session_id",
            "js_code",
            "scraping_strategy",
            "extraction_strategy",
            "chunking_strategy",
            "cache_mode",
            "content_filter",
            "semaphore_count",
            "url"
        ]
        for key in ephemeral_keys:
            if key in config_dict:
                del config_dict[key]
        # Convert to canonical JSON string
        signature_json = json.dumps(config_dict, sort_keys=True, default=str)
        # Hash the JSON so we get a compact, unique string
        signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
        return signature_hash
    async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
        """
        Get a page for the given session ID, creating a new one if needed.
@@ -651,24 +686,38 @@ class BrowserManager:
            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
        Returns:
-            Page: The page object for the given session ID.
+            tuple[Page, BrowserContext]: The page, and the browser context that page belongs to.
        """
        self._cleanup_expired_sessions()
        # If a session_id is provided and we already have it, reuse that page + context
        if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
            context, page, _ = self.sessions[crawlerRunConfig.session_id]
            # Update last-used timestamp
            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
            return page, context
        # If using a managed browser, just grab the shared default_context
        if self.config.use_managed_browser:
            context = self.default_context
            page = await context.new_page()
        else:
-            context = await self.create_browser_context()
+            # Otherwise, check if we have an existing context for this config
-            await self.setup_context(context, crawlerRunConfig)
+            config_signature = self._make_config_signature(crawlerRunConfig)
            async with self._contexts_lock:
                if config_signature in self.contexts_by_config:
                    context = self.contexts_by_config[config_signature]
                else:
                    # Create and setup a new context
                    context = await self.create_browser_context()
                    await self.setup_context(context, crawlerRunConfig)
                    self.contexts_by_config[config_signature] = context
            # Create a new page from the chosen context
            page = await context.new_page()
        # If a session_id is specified, store this session so we can reuse later
        if crawlerRunConfig.session_id:
            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
@@ -708,6 +757,18 @@ class BrowserManager:
        for session_id in session_ids:
            await self.kill_session(session_id)
        # Now close all contexts we created. This reclaims memory from ephemeral contexts.
        for ctx in self.contexts_by_config.values():
            try:
                await ctx.close()
            except Exception as e:
                self.logger.error(
                    message="Error closing context: {error}",
                    tag="ERROR",
                    params={"error": str(e)}
                )
        self.contexts_by_config.clear()
        if self.browser:
            await self.browser.close()
            self.browser = None
@@ -1204,7 +1265,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            await context.add_init_script(load_js_script("navigator_overrider"))
        # Call hook after page creation
-        await self.execute_hook("on_page_context_created", page, context=context)
+        await self.execute_hook("on_page_context_created", page, context=context, config=config)
        # Set up console logging if requested
        if config.log_console:
@@ -1245,7 +1306,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            # Handle page navigation and content loading
            if not config.js_only:
-                await self.execute_hook("before_goto", page, context=context, url=url)
+                await self.execute_hook("before_goto", page, context=context, url=url, config=config)
                try:
                    # Generate a unique nonce for this request
@@ -1265,7 +1326,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
                await self.execute_hook(
-                    "after_goto", page, context=context, url=url, response=response
+                    "after_goto", page, context=context, url=url, response=response, config=config
                )
                if response is None:
@@ -1439,7 +1500,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        params={"error": execution_result.get("error")},
                    )
-                await self.execute_hook("on_execution_started", page, context=context)
+                await self.execute_hook("on_execution_started", page, context=context, config=config)
            # Handle user simulation
            if config.simulate_user or config.magic:
@@ -1482,7 +1543,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                page = await self.process_iframes(page)
            # Pre-content retrieval hooks and delay
-            await self.execute_hook("before_retrieve_html", page, context=context)
+            await self.execute_hook("before_retrieve_html", page, context=context, config=config)
            if config.delay_before_return_html:
                await asyncio.sleep(config.delay_before_return_html)
@@ -1493,7 +1554,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            # Get final HTML content
            html = await page.content()
            await self.execute_hook(
-                "before_return_html", page=page, html=html, context=context
+                "before_return_html", page=page, html=html, context=context, config=config
            )
            # Handle PDF and screenshot generation
@@ -1989,10 +2050,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    t1 = time.time()
                    try:
                        await page.wait_for_load_state("domcontentloaded", timeout=5000)
                        print(
                            "DOM content loaded after script execution in",
                            time.time() - t1,
                        )
                    except Error as e:
                        self.logger.warning(
                            message="DOM content load timeout: {error}",
@@ -2099,13 +2156,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    # Wait for network idle after script execution
                    t1 = time.time()
                    await page.wait_for_load_state("domcontentloaded", timeout=5000)
-                    print(
+
                        "DOM content loaded after script execution in", time.time() - t1
                    )
                    t1 = time.time()
                    await page.wait_for_load_state("networkidle", timeout=5000)
                    print("Network idle after script execution in", time.time() - t1)
                    results.append(result if result else {"success": True})
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -231,7 +231,7 @@ async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "KidoCode Courses",
-        "baseSelector": "section.charge-methodology .w-tab-content > div",
+        "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
        "fields": [
            {
                "name": "section_title",
@@ -279,6 +279,7 @@ async def extract_structured_data_using_css_extractor():
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
        js_code=[js_click_tabs],
        delay_before_return_html=1
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
@@ -591,29 +592,26 @@ async def speed_comparison():
 # Main execution
 async def main():
    # Basic examples
-    # await simple_crawl()
+    await simple_crawl()
-    # await simple_example_with_running_js_code()
+    await simple_example_with_running_js_code()
-    # await simple_example_with_css_selector()
+    await simple_example_with_css_selector()
    # Advanced examples
-    # await extract_structured_data_using_css_extractor()
+    await extract_structured_data_using_css_extractor()
    await extract_structured_data_using_llm(
        "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
    )
-    # await crawl_dynamic_content_pages_method_1()
+    await crawl_dynamic_content_pages_method_1()
-    # await crawl_dynamic_content_pages_method_2()
+    await crawl_dynamic_content_pages_method_2()
    # Browser comparisons
-    # await crawl_custom_browser_type()
+    await crawl_custom_browser_type()
    # Performance testing
    # await speed_comparison()
    # Screenshot example
-    # await capture_and_save_screenshot(
+    await capture_and_save_screenshot(
-    #     "https://www.example.com",
+        "https://www.example.com",
-    #     os.path.join(__location__, "tmp/example_screenshot.jpg")
+        os.path.join(__location__, "tmp/example_screenshot.jpg")
-    # )
+    )
 if __name__ == "__main__":
--- a/tests/20241401/test_cache_context.py
+++ b/tests/20241401/test_cache_context.py
@@ -0,0 +1,85 @@
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from playwright.async_api import Page, BrowserContext
 async def test_reuse_context_by_config():
    """Check that crawls with the same run config share one BrowserContext.

    Crawls the same URL twice with each of two configs that differ in
    ``only_text`` (and in their ``shared_data`` label), recording ``id()`` of
    the context handed to the ``on_page_context_created`` hook each time.

    Raises:
        AssertionError: If a config saw anything other than exactly one
            distinct context, or if both configs ended up on the same context.
    """
    # Context IDs seen by the hook, grouped by the label carried in shared_data
    context_ids_for_A = []
    context_ids_for_B = []
    # Hook that records which context each crawl was given
    async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs):
        c_id = id(context)
        print(f"[HOOK] on_page_context_created - Context ID: {c_id}")
        # shared_data defaults to None, so guard before reading the label
        config_label = (config.shared_data or {}).get("config_label", "unknown")
        if config_label == "A":
            context_ids_for_A.append(c_id)
        elif config_label == "B":
            context_ids_for_B.append(c_id)
        return page
    # Browser config - headless, verbose so we see logs
    browser_config = BrowserConfig(headless=True, verbose=True)
    # Two run configs that differ in only_text; the label tells the hook which is which
    configA = CrawlerRunConfig(
        only_text=True,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "A"},
    )
    configB = CrawlerRunConfig(
        only_text=False,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "B"},
    )
    # Create the crawler and attach the hook before starting
    # NOTE(review): assumes the hook fires for every crawl, including crawls
    # that reuse an existing context - confirm against the crawler strategy.
    crawler = AsyncWebCrawler(config=browser_config)
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
    # Start the crawler (launches the browser)
    await crawler.start()
    try:
        # Crawl a benign site multiple times with each config
        test_url = "https://example.com"
        print("\n--- Crawling with config A (only_text=True) ---")
        for _ in range(2):
            await crawler.arun(test_url, config=configA)
        print("\n--- Crawling with config B (only_text=False) ---")
        for _ in range(2):
            await crawler.arun(test_url, config=configB)
    finally:
        # Always shut the browser down, even if a crawl raised
        await crawler.close()
    # Validate and show the results
    print("\n=== RESULTS ===")
    print(f"Config A context IDs: {context_ids_for_A}")
    print(f"Config B context IDs: {context_ids_for_B}")
    a_reused = len(set(context_ids_for_A)) == 1
    b_reused = len(set(context_ids_for_B)) == 1
    a_b_distinct = set(context_ids_for_A).isdisjoint(context_ids_for_B)
    if a_reused:
        print("✅ All config A crawls used the SAME BrowserContext.")
    else:
        print("❌ Config A crawls created multiple contexts unexpectedly.")
    if b_reused:
        print("✅ All config B crawls used the SAME BrowserContext.")
    else:
        print("❌ Config B crawls created multiple contexts unexpectedly.")
    if a_b_distinct:
        print("✅ Config A context is different from Config B context.")
    else:
        print("❌ A and B ended up sharing the same context somehow!")
    # Fail the run instead of only printing, so regressions are not silent
    assert a_reused, f"config A used {len(set(context_ids_for_A))} distinct contexts, expected 1"
    assert b_reused, f"config B used {len(set(context_ids_for_B))} distinct contexts, expected 1"
    assert a_b_distinct, "config A and config B shared a BrowserContext"
 if __name__ == "__main__":
    asyncio.run(test_reuse_context_by_config())