diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index c1404026..d0a9b9e1 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -32,6 +32,7 @@ class BrowserConfig: Default: True. use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing advanced manipulation. Default: False. + cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: None. debugging_port (int): Port for the browser debugging protocol. Default: 9222. use_persistent_context (bool): Use a persistent browser context (like a persistent profile). Automatically sets use_managed_browser=True. Default: False. @@ -80,6 +81,7 @@ class BrowserConfig: browser_type: str = "chromium", headless: bool = True, use_managed_browser: bool = False, + cdp_url: str = None, use_persistent_context: bool = False, user_data_dir: str = None, chrome_channel: str = "chromium", @@ -107,10 +109,12 @@ class BrowserConfig: light_mode: bool = False, extra_args: list = None, debugging_port: int = 9222, + host: str = "localhost", ): self.browser_type = browser_type self.headless = headless self.use_managed_browser = use_managed_browser + self.cdp_url = cdp_url self.use_persistent_context = use_persistent_context self.user_data_dir = user_data_dir self.chrome_channel = chrome_channel or self.browser_type or "chromium" @@ -162,6 +166,7 @@ class BrowserConfig: browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), use_managed_browser=kwargs.get("use_managed_browser", False), + cdp_url=kwargs.get("cdp_url"), use_persistent_context=kwargs.get("use_persistent_context", False), user_data_dir=kwargs.get("user_data_dir"), chrome_channel=kwargs.get("chrome_channel", "chromium"), @@ -194,6 +199,7 @@ class BrowserConfig: "browser_type": self.browser_type, "headless": self.headless, "use_managed_browser": self.use_managed_browser, + "cdp_url": self.cdp_url, "use_persistent_context": 
self.use_persistent_context, "user_data_dir": self.user_data_dir, "chrome_channel": self.chrome_channel, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 738dfb51..b11796e0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -102,6 +102,7 @@ class ManagedBrowser: logger=None, host: str = "localhost", debugging_port: int = 9222, + cdp_url: Optional[str] = None, ): """ Initialize the ManagedBrowser instance. @@ -116,6 +117,7 @@ class ManagedBrowser: logger (logging.Logger): Logger instance for logging messages. Default: None. host (str): Host for debugging the browser. Default: "localhost". debugging_port (int): Port for debugging the browser. Default: 9222. + cdp_url (str or None): CDP URL to connect to the browser. Default: None. """ self.browser_type = browser_type self.user_data_dir = user_data_dir @@ -129,9 +131,16 @@ class ManagedBrowser: async def start(self) -> str: """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. + Starts the browser process or returns CDP endpoint URL. + If cdp_url is provided, returns it directly. + If user_data_dir is not provided for local browser, creates a temporary directory. + + Returns: + str: CDP endpoint URL """ + # If CDP URL provided, just return it + if self.cdp_url: + return self.cdp_url # Create temp dir if needed if not self.user_data_dir: