diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 70ed20e4..8fed970c 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -608,6 +608,11 @@ class BrowserManager: self.contexts_by_config = {} self._contexts_lock = asyncio.Lock() + # Serialize context.new_page() across concurrent tasks to avoid races + # when using a shared persistent context (context.pages may be empty + # for all racers). Prevents 'Target page/context closed' errors. + self._page_lock = asyncio.Lock() + # Stealth-related attributes self._stealth_instance = None self._stealth_cm = None @@ -1027,13 +1032,26 @@ class BrowserManager: context = await self.create_browser_context(crawlerRunConfig) ctx = self.default_context # default context, one window only ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config) - page = await ctx.new_page() + # Avoid concurrent new_page on shared persistent context + # See GH-1198: context.pages can be empty under races + async with self._page_lock: + page = await ctx.new_page() else: context = self.default_context pages = context.pages page = next((p for p in pages if p.url == crawlerRunConfig.url), None) if not page: - page = context.pages[0] # await context.new_page() + if pages: + page = pages[0] + else: + # Double-check under lock to avoid TOCTOU and ensure only + # one task calls new_page when pages=[] concurrently + async with self._page_lock: + pages = context.pages + if pages: + page = pages[0] + else: + page = await context.new_page() else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) diff --git a/tests/general/test_persistent_context.py b/tests/general/test_persistent_context.py new file mode 100644 index 00000000..48c01bff --- /dev/null +++ b/tests/general/test_persistent_context.py @@ -0,0 +1,43 @@ +import asyncio +import os +from crawl4ai.async_webcrawler import AsyncWebCrawler +from 
crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode + +# Simple concurrency test for persistent context page creation +# Usage: python tests/general/test_persistent_context.py + +URLS = [ + # "https://example.com", + "https://httpbin.org/html", + "https://www.python.org/", + "https://www.rust-lang.org/", +] + +async def main(): + profile_dir = os.path.join(os.path.expanduser("~"), ".crawl4ai", "profiles", "test-persistent-profile") + os.makedirs(profile_dir, exist_ok=True) + + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + use_persistent_context=True, + user_data_dir=profile_dir, + use_managed_browser=True, + verbose=True, + ) + + run_cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=False, + verbose=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + results = await crawler.arun_many(URLS, config=run_cfg) + for r in results: + print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0) + # r = await crawler.arun(url=URLS[0], config=run_cfg) + # print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0) + +if __name__ == "__main__": + asyncio.run(main())