From 69a77222efe976b4fb9c3b2074817e858a6d7248 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 24 Jan 2025 15:53:47 +0800 Subject: [PATCH] feat(browser): add CDP URL configuration support Add support for direct CDP URL configuration in BrowserConfig and ManagedBrowser classes. This allows connecting to remote browser instances using custom CDP endpoints instead of always launching a local browser. - Added cdp_url parameter to BrowserConfig - Added cdp_url support in ManagedBrowser.start() method - Updated documentation for new parameters --- crawl4ai/async_configs.py | 6 ++++++ crawl4ai/async_crawler_strategy.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index c1404026..d0a9b9e1 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -32,6 +32,7 @@ class BrowserConfig: Default: True. use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing advanced manipulation. Default: False. + cdp_url (str or None): URL for the Chrome DevTools Protocol (CDP) endpoint, e.g. "ws://localhost:9222/devtools/browser/". Default: None. debugging_port (int): Port for the browser debugging protocol. Default: 9222. use_persistent_context (bool): Use a persistent browser context (like a persistent profile). Automatically sets use_managed_browser=True. Default: False. 
@@ -80,6 +81,7 @@ class BrowserConfig: browser_type: str = "chromium", headless: bool = True, use_managed_browser: bool = False, + cdp_url: str = None, use_persistent_context: bool = False, user_data_dir: str = None, chrome_channel: str = "chromium", @@ -107,10 +109,12 @@ class BrowserConfig: light_mode: bool = False, extra_args: list = None, debugging_port: int = 9222, + host: str = "localhost", ): self.browser_type = browser_type self.headless = headless self.use_managed_browser = use_managed_browser + self.cdp_url = cdp_url self.use_persistent_context = use_persistent_context self.user_data_dir = user_data_dir self.chrome_channel = chrome_channel or self.browser_type or "chromium" @@ -162,6 +166,7 @@ class BrowserConfig: browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), use_managed_browser=kwargs.get("use_managed_browser", False), + cdp_url=kwargs.get("cdp_url"), use_persistent_context=kwargs.get("use_persistent_context", False), user_data_dir=kwargs.get("user_data_dir"), chrome_channel=kwargs.get("chrome_channel", "chromium"), @@ -194,6 +199,7 @@ class BrowserConfig: "browser_type": self.browser_type, "headless": self.headless, "use_managed_browser": self.use_managed_browser, + "cdp_url": self.cdp_url, "use_persistent_context": self.use_persistent_context, "user_data_dir": self.user_data_dir, "chrome_channel": self.chrome_channel, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 738dfb51..b11796e0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -102,6 +102,7 @@ class ManagedBrowser: logger=None, host: str = "localhost", debugging_port: int = 9222, + cdp_url: Optional[str] = None, ): """ Initialize the ManagedBrowser instance. @@ -116,6 +117,7 @@ class ManagedBrowser: logger (logging.Logger): Logger instance for logging messages. Default: None. host (str): Host for debugging the browser. Default: "localhost". 
debugging_port (int): Port for debugging the browser. Default: 9222. + cdp_url (str or None): CDP URL to connect to the browser. Default: None. """ self.browser_type = browser_type self.user_data_dir = user_data_dir @@ -129,9 +131,16 @@ class ManagedBrowser: async def start(self) -> str: """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. + Starts the browser process or returns CDP endpoint URL. + If cdp_url is provided, returns it directly. + If user_data_dir is not provided for local browser, creates a temporary directory. + + Returns: + str: CDP endpoint URL """ + # If CDP URL provided, just return it + if self.cdp_url: + return self.cdp_url # Create temp dir if needed if not self.user_data_dir: