Add browser_context_id and target_id parameters to BrowserConfig

Enable Crawl4AI to connect to pre-created CDP browser contexts, which is
essential for cloud browser services that pre-create isolated contexts.

Changes:
- Add browser_context_id and target_id parameters to BrowserConfig
- Update from_kwargs() and to_dict() methods
- Modify BrowserManager.start() to use existing context when provided
- Add _get_page_by_target_id() helper method
- Update get_page() to handle pre-existing targets
- Add test for browser_context_id functionality

This enables cloud services to:
1. Create isolated CDP contexts before Crawl4AI connects
2. Pass context/target IDs to BrowserConfig
3. Have Crawl4AI reuse existing contexts instead of creating new ones
This commit is contained in:
unclecode
2025-12-13 02:42:48 +00:00
parent 306ddcbf3d
commit 8ae908bede
3 changed files with 602 additions and 4 deletions

View File

@@ -373,6 +373,13 @@ class BrowserConfig:
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
advanced manipulation. Default: False.
cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/".
browser_context_id (str or None): Pre-existing CDP browser context ID to use. When provided along with
cdp_url, the crawler will reuse this context instead of creating a new one.
Useful for cloud browser services that pre-create isolated contexts.
Default: None.
target_id (str or None): Pre-existing CDP target ID (page) to use. When provided along with
browser_context_id, the crawler will reuse this target instead of creating
a new page. Default: None.
debugging_port (int): Port for the browser debugging protocol. Default: 9222.
use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
Automatically sets use_managed_browser=True. Default: False.
@@ -427,6 +434,8 @@ class BrowserConfig:
browser_mode: str = "dedicated",
use_managed_browser: bool = False,
cdp_url: str = None,
browser_context_id: str = None,
target_id: str = None,
use_persistent_context: bool = False,
user_data_dir: str = None,
chrome_channel: str = "chromium",
@@ -462,10 +471,12 @@ class BrowserConfig:
):
self.browser_type = browser_type
self.headless = headless
self.headless = headless
self.browser_mode = browser_mode
self.use_managed_browser = use_managed_browser
self.cdp_url = cdp_url
self.browser_context_id = browser_context_id
self.target_id = target_id
self.use_persistent_context = use_persistent_context
self.user_data_dir = user_data_dir
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
@@ -561,6 +572,8 @@ class BrowserConfig:
browser_mode=kwargs.get("browser_mode", "dedicated"),
use_managed_browser=kwargs.get("use_managed_browser", False),
cdp_url=kwargs.get("cdp_url"),
browser_context_id=kwargs.get("browser_context_id"),
target_id=kwargs.get("target_id"),
use_persistent_context=kwargs.get("use_persistent_context", False),
user_data_dir=kwargs.get("user_data_dir"),
chrome_channel=kwargs.get("chrome_channel", "chromium"),
@@ -598,6 +611,8 @@ class BrowserConfig:
"browser_mode": self.browser_mode,
"use_managed_browser": self.use_managed_browser,
"cdp_url": self.cdp_url,
"browser_context_id": self.browser_context_id,
"target_id": self.target_id,
"use_persistent_context": self.use_persistent_context,
"user_data_dir": self.user_data_dir,
"chrome_channel": self.chrome_channel,

View File

@@ -661,14 +661,44 @@ class BrowserManager:
if self.config.cdp_url or self.config.use_managed_browser:
self.config.use_managed_browser = True
cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
# Add CDP endpoint verification before connecting
if not await self._verify_cdp_ready(cdp_url):
raise Exception(f"CDP endpoint at {cdp_url} is not ready after startup")
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
contexts = self.browser.contexts
if contexts:
# If browser_context_id is provided, we're using a pre-created context
if self.config.browser_context_id:
if self.logger:
self.logger.debug(
f"Using pre-existing browser context: {self.config.browser_context_id}",
tag="BROWSER"
)
# When connecting to a pre-created context, it should be in contexts
if contexts:
self.default_context = contexts[0]
if self.logger:
self.logger.debug(
f"Found {len(contexts)} existing context(s), using first one",
tag="BROWSER"
)
else:
# Context was created but not yet visible - wait a bit
await asyncio.sleep(0.2)
contexts = self.browser.contexts
if contexts:
self.default_context = contexts[0]
else:
# Still no contexts - this shouldn't happen with pre-created context
if self.logger:
self.logger.warning(
"Pre-created context not found, creating new one",
tag="BROWSER"
)
self.default_context = await self.create_browser_context()
elif contexts:
self.default_context = contexts[0]
else:
self.default_context = await self.create_browser_context()
@@ -1042,6 +1072,62 @@ class BrowserManager:
params={"error": str(e)}
)
async def _get_page_by_target_id(self, context: BrowserContext, target_id: str):
"""
Get an existing page by its CDP target ID.
This is used when connecting to a pre-created browser context with an existing page.
Playwright may not immediately see targets created via raw CDP commands, so we
use CDP to get all targets and find the matching one.
Args:
context: The browser context to search in
target_id: The CDP target ID to find
Returns:
Page object if found, None otherwise
"""
try:
# First check if Playwright already sees the page
for page in context.pages:
# Playwright's internal target ID might match
if hasattr(page, '_impl_obj') and hasattr(page._impl_obj, '_target_id'):
if page._impl_obj._target_id == target_id:
return page
# If not found, try using CDP to get targets
if hasattr(self.browser, '_impl_obj') and hasattr(self.browser._impl_obj, '_connection'):
cdp_session = await context.new_cdp_session(context.pages[0] if context.pages else None)
if cdp_session:
try:
result = await cdp_session.send("Target.getTargets")
targets = result.get("targetInfos", [])
for target in targets:
if target.get("targetId") == target_id:
# Found the target - if it's a page type, we can use it
if target.get("type") == "page":
# The page exists, let Playwright discover it
await asyncio.sleep(0.1)
# Refresh pages list
if context.pages:
return context.pages[0]
finally:
await cdp_session.detach()
# Fallback: if there are any pages now, return the first one
if context.pages:
return context.pages[0]
return None
except Exception as e:
if self.logger:
self.logger.warning(
message="Failed to get page by target ID: {error}",
tag="BROWSER",
params={"error": str(e)}
)
return None
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
"""
Get a page for the given session ID, creating a new one if needed.
@@ -1086,6 +1172,14 @@ class BrowserManager:
pages = context.pages
if pages:
page = pages[0]
elif self.config.browser_context_id and self.config.target_id:
# Pre-existing context/target provided - use CDP to get the page
# This handles the case where Playwright doesn't see the target yet
page = await self._get_page_by_target_id(context, self.config.target_id)
if not page:
# Fallback: create new page in existing context
page = await context.new_page()
await self._apply_stealth_to_page(page)
else:
page = await context.new_page()
await self._apply_stealth_to_page(page)