fix(browser_manager): serialize new_page on persistent context to avoid races ref #1198
- Add _page_lock and guarded creation; handle empty context.pages safely
- Prevents BrowserContext.new_page “Target page/context closed” errors during concurrent arun_many
This commit is contained in:
@@ -608,6 +608,11 @@ class BrowserManager:
|
|||||||
self.contexts_by_config = {}
|
self.contexts_by_config = {}
|
||||||
self._contexts_lock = asyncio.Lock()
|
self._contexts_lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# Serialize context.new_page() across concurrent tasks to avoid races
|
||||||
|
# when using a shared persistent context (context.pages may be empty
|
||||||
|
# for all racers). Prevents 'Target page/context closed' errors.
|
||||||
|
self._page_lock = asyncio.Lock()
|
||||||
|
|
||||||
# Stealth-related attributes
|
# Stealth-related attributes
|
||||||
self._stealth_instance = None
|
self._stealth_instance = None
|
||||||
self._stealth_cm = None
|
self._stealth_cm = None
|
||||||
@@ -1027,13 +1032,26 @@ class BrowserManager:
|
|||||||
context = await self.create_browser_context(crawlerRunConfig)
|
context = await self.create_browser_context(crawlerRunConfig)
|
||||||
ctx = self.default_context # default context, one window only
|
ctx = self.default_context # default context, one window only
|
||||||
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
|
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
|
||||||
page = await ctx.new_page()
|
# Avoid concurrent new_page on shared persistent context
|
||||||
|
# See GH-1198: context.pages can be empty under races
|
||||||
|
async with self._page_lock:
|
||||||
|
page = await ctx.new_page()
|
||||||
else:
|
else:
|
||||||
context = self.default_context
|
context = self.default_context
|
||||||
pages = context.pages
|
pages = context.pages
|
||||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||||
if not page:
|
if not page:
|
||||||
page = context.pages[0] # await context.new_page()
|
if pages:
|
||||||
|
page = pages[0]
|
||||||
|
else:
|
||||||
|
# Double-check under lock to avoid TOCTOU and ensure only
|
||||||
|
# one task calls new_page when pages=[] concurrently
|
||||||
|
async with self._page_lock:
|
||||||
|
pages = context.pages
|
||||||
|
if pages:
|
||||||
|
page = pages[0]
|
||||||
|
else:
|
||||||
|
page = await context.new_page()
|
||||||
else:
|
else:
|
||||||
# Otherwise, check if we have an existing context for this config
|
# Otherwise, check if we have an existing context for this config
|
||||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
|||||||
43
tests/general/test_persistent_context.py
Normal file
43
tests/general/test_persistent_context.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
# Simple concurrency test for persistent context page creation
|
||||||
|
# Usage: python tests/general/test_persistent_context.py
|
||||||
|
|
||||||
|
URLS = [
|
||||||
|
# "https://example.com",
|
||||||
|
"https://httpbin.org/html",
|
||||||
|
"https://www.python.org/",
|
||||||
|
"https://www.rust-lang.org/",
|
||||||
|
]
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
profile_dir = os.path.join(os.path.expanduser("~"), ".crawl4ai", "profiles", "test-persistent-profile")
|
||||||
|
os.makedirs(profile_dir, exist_ok=True)
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_type="chromium",
|
||||||
|
headless=True,
|
||||||
|
use_persistent_context=True,
|
||||||
|
user_data_dir=profile_dir,
|
||||||
|
use_managed_browser=True,
|
||||||
|
verbose=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
run_cfg = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
stream=False,
|
||||||
|
verbose=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
results = await crawler.arun_many(URLS, config=run_cfg)
|
||||||
|
for r in results:
|
||||||
|
print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0)
|
||||||
|
# r = await crawler.arun(url=URLS[0], config=run_cfg)
|
||||||
|
# print(r.url, r.success, len(r.markdown.raw_markdown) if r.markdown else 0)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
Reference in New Issue
Block a user