import asyncio import time from typing import Dict, List, Optional, Tuple import os import sys import shutil import tempfile import psutil import signal import subprocess import shlex from playwright.async_api import BrowserContext import hashlib from .js_snippet import load_js_script from .config import DOWNLOAD_PAGE_TIMEOUT from .async_configs import BrowserConfig, CrawlerRunConfig from .utils import get_chromium_path import warnings BROWSER_DISABLE_OPTIONS = [ "--disable-background-networking", "--disable-background-timer-throttling", "--disable-backgrounding-occluded-windows", "--disable-breakpad", "--disable-client-side-phishing-detection", "--disable-component-extensions-with-background-pages", "--disable-default-apps", "--disable-extensions", "--disable-features=TranslateUI", "--disable-hang-monitor", "--disable-ipc-flooding-protection", "--disable-popup-blocking", "--disable-prompt-on-repost", "--disable-sync", "--force-color-profile=srgb", "--metrics-recording-only", "--no-first-run", "--password-store=basic", "--use-mock-keychain", ] class ManagedBrowser: """ Manages the browser process and context. This class allows to connect to the browser using CDP protocol. Attributes: browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". Default: "chromium". user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a temporary directory may be used. Default: None. headless (bool): Whether to run the browser in headless mode (no visible GUI). Default: True. browser_process (subprocess.Popen): The process object for the browser. temp_dir (str): Temporary directory for user data if not provided. debugging_port (int): Port for debugging the browser. host (str): Host for debugging the browser. Methods: start(): Starts the browser process and returns the CDP endpoint URL. _get_browser_path(): Returns the browser executable path based on OS and browser type. _get_browser_args(): Returns browser-specific command line arguments. _get_user_data_dir(): Returns the user data directory path. _cleanup(): Terminates the browser process and removes the temporary directory. create_profile(): Static method to create a user profile by launching a browser for user interaction. """ @staticmethod def build_browser_flags(config: BrowserConfig) -> List[str]: """Common CLI flags for launching Chromium""" flags = [ "--disable-gpu", "--disable-gpu-compositing", "--disable-software-rasterizer", "--no-sandbox", "--disable-dev-shm-usage", "--no-first-run", "--no-default-browser-check", "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", "--disable-blink-features=AutomationControlled", "--window-position=400,0", "--disable-renderer-backgrounding", "--disable-ipc-flooding-protection", "--force-color-profile=srgb", "--mute-audio", "--disable-background-timer-throttling", # Memory-saving flags: disable unused Chrome features "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", "--disable-component-update", "--disable-domain-reliability", ] if config.memory_saving_mode: flags.extend([ "--aggressive-cache-discard", '--js-flags=--max-old-space-size=512', ]) if config.light_mode: flags.extend(BROWSER_DISABLE_OPTIONS) if config.text_mode: flags.extend([ "--blink-settings=imagesEnabled=false", "--disable-remote-fonts", "--disable-images", "--disable-javascript", "--disable-software-rasterizer", "--disable-dev-shm-usage", ]) # proxy support if config.proxy: flags.append(f"--proxy-server={config.proxy}") elif config.proxy_config: creds = "" if config.proxy_config.username and config.proxy_config.password: creds = f"{config.proxy_config.username}:{config.proxy_config.password}@" flags.append(f"--proxy-server={creds}{config.proxy_config.server}") # dedupe return list(dict.fromkeys(flags)) browser_type: str user_data_dir: str headless: bool browser_process: subprocess.Popen temp_dir: str debugging_port: int host: str def __init__( self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger=None, host: str = "localhost", debugging_port: int = 9222, cdp_url: Optional[str] = None, browser_config: Optional[BrowserConfig] = None, ): """ Initialize the ManagedBrowser instance. Args: browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". Default: "chromium". user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a temporary directory may be used. Default: None. headless (bool): Whether to run the browser in headless mode (no visible GUI). Default: True. logger (logging.Logger): Logger instance for logging messages. Default: None. host (str): Host for debugging the browser. Default: "localhost". debugging_port (int): Port for debugging the browser. Default: 9222. cdp_url (str or None): CDP URL to connect to the browser. Default: None. browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None. """ self.browser_type = browser_config.browser_type self.user_data_dir = browser_config.user_data_dir self.headless = browser_config.headless self.browser_process = None self.temp_dir = None self.debugging_port = browser_config.debugging_port self.host = browser_config.host self.logger = logger self.shutting_down = False self.cdp_url = browser_config.cdp_url self.browser_config = browser_config async def start(self) -> str: """ Starts the browser process or returns CDP endpoint URL. If cdp_url is provided, returns it directly. If user_data_dir is not provided for local browser, creates a temporary directory. Returns: str: CDP endpoint URL """ # If CDP URL provided, just return it if self.cdp_url: return self.cdp_url # Create temp dir if needed if not self.user_data_dir: self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") self.user_data_dir = self.temp_dir # Get browser path and args based on OS and browser type # browser_path = self._get_browser_path() args = await self._get_browser_args() if self.browser_config.extra_args: args.extend(self.browser_config.extra_args) # ── make sure no old Chromium instance is owning the same port/profile ── try: if sys.platform == "win32": if psutil is None: raise RuntimeError("psutil not available, cannot clean old browser") for p in psutil.process_iter(["pid", "name", "cmdline"]): cl = " ".join(p.info.get("cmdline") or []) if ( f"--remote-debugging-port={self.debugging_port}" in cl and f"--user-data-dir={self.user_data_dir}" in cl ): p.kill() p.wait(timeout=5) else: # macOS / Linux # kill any process listening on the same debugging port pids = ( subprocess.check_output(shlex.split(f"lsof -t -i:{self.debugging_port}")) .decode() .strip() .splitlines() ) for pid in pids: try: os.kill(int(pid), signal.SIGTERM) except ProcessLookupError: pass # remove Chromium singleton locks, or new launch exits with # “Opening in existing browser session.” for f in ("SingletonLock", "SingletonSocket", "SingletonCookie"): fp = os.path.join(self.user_data_dir, f) if os.path.exists(fp): os.remove(fp) except Exception as _e: # non-fatal — we'll try to start anyway, but log what happened self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER") # Start browser process try: # Use DETACHED_PROCESS flag on Windows to fully detach the process # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group if sys.platform == "win32": self.browser_process = subprocess.Popen( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP ) else: self.browser_process = subprocess.Popen( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=os.setpgrp # Start in a new process group ) # If verbose is True print args used to run the process if self.logger and self.browser_config.verbose: self.logger.debug( f"Starting browser with args: {' '.join(args)}", tag="BROWSER" ) # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring await asyncio.sleep(0.5) # Give browser time to start await self._initial_startup_check() await asyncio.sleep(2) # Give browser time to start return f"http://{self.host}:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") async def _initial_startup_check(self): """ Perform a quick check to make sure the browser started successfully. This only runs once at startup rather than continuously monitoring. """ if not self.browser_process: return # Check that process started without immediate termination await asyncio.sleep(0.5) if self.browser_process.poll() is not None: # Process already terminated stdout, stderr = b"", b"" try: stdout, stderr = self.browser_process.communicate(timeout=0.5) except subprocess.TimeoutExpired: pass self.logger.error( message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", tag="ERROR", params={ "code": self.browser_process.returncode, "stdout": stdout.decode() if stdout else "", "stderr": stderr.decode() if stderr else "", }, ) async def _monitor_browser_process(self): """ Monitor the browser process for unexpected termination. How it works: 1. Read stdout and stderr from the browser process. 2. If the process has terminated, log the error message and terminate the browser. 3. If the shutting_down flag is set, log the normal termination message. 4. If any other error occurs, log the error message. Note: This method should be called in a separate task to avoid blocking the main event loop. This is DEPRECATED and should not be used for builtin browsers that need to outlive the Python process. """ if self.browser_process: try: stdout, stderr = await asyncio.gather( asyncio.to_thread(self.browser_process.stdout.read), asyncio.to_thread(self.browser_process.stderr.read), ) # Check shutting_down flag BEFORE logging anything if self.browser_process.poll() is not None: if not self.shutting_down: self.logger.error( message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", tag="ERROR", params={ "code": self.browser_process.returncode, "stdout": stdout.decode(), "stderr": stderr.decode(), }, ) await self.cleanup() else: self.logger.info( message="Browser process terminated normally | Code: {code}", tag="INFO", params={"code": self.browser_process.returncode}, ) except Exception as e: if not self.shutting_down: self.logger.error( message="Error monitoring browser process: {error}", tag="ERROR", params={"error": str(e)}, ) def _get_browser_path_WIP(self) -> str: """Returns the browser executable path based on OS and browser type""" if sys.platform == "darwin": # macOS paths = { "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", "webkit": "/Applications/Safari.app/Contents/MacOS/Safari", } elif sys.platform == "win32": # Windows paths = { "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", "webkit": None, # WebKit not supported on Windows } else: # Linux paths = { "chromium": "google-chrome", "firefox": "firefox", "webkit": None, # WebKit not supported on Linux } return paths.get(self.browser_type) async def _get_browser_path(self) -> str: browser_path = await get_chromium_path(self.browser_type) return browser_path async def _get_browser_args(self) -> List[str]: """Returns full CLI args for launching the browser""" base = [await self._get_browser_path()] if self.browser_type == "chromium": flags = [ f"--remote-debugging-port={self.debugging_port}", f"--user-data-dir={self.user_data_dir}", ] if self.headless: flags.append("--headless=new") # Add viewport flag if specified in config if self.browser_config.viewport_height and self.browser_config.viewport_width: flags.append(f"--window-size={self.browser_config.viewport_width},{self.browser_config.viewport_height}") # merge common launch flags flags.extend(self.build_browser_flags(self.browser_config)) elif self.browser_type == "firefox": flags = [ "--remote-debugging-port", str(self.debugging_port), "--profile", self.user_data_dir, ] if self.headless: flags.append("--headless") else: raise NotImplementedError(f"Browser type {self.browser_type} not supported") return base + flags async def cleanup(self): """Cleanup browser process and temporary directory""" # Set shutting_down flag BEFORE any termination actions self.shutting_down = True if self.browser_process: try: # For builtin browsers that should persist, we should check if it's a detached process # Only terminate if we have proper control over the process if not self.browser_process.poll(): # Process is still running self.browser_process.terminate() # Wait for process to end gracefully for _ in range(10): # 10 attempts, 100ms each if self.browser_process.poll() is not None: break await asyncio.sleep(0.1) # Force kill if still running if self.browser_process.poll() is None: if sys.platform == "win32": # On Windows we might need taskkill for detached processes try: subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)]) except Exception: self.browser_process.kill() else: self.browser_process.kill() await asyncio.sleep(0.1) # Brief wait for kill to take effect except Exception as e: self.logger.error( message="Error terminating browser: {error}", tag="ERROR", params={"error": str(e)}, ) if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) except Exception as e: self.logger.error( message="Error removing temporary directory: {error}", tag="ERROR", params={"error": str(e)}, ) # These methods have been moved to BrowserProfiler class @staticmethod async def create_profile(browser_config=None, profile_name=None, logger=None): """ This method has been moved to the BrowserProfiler class. Creates a browser profile by launching a browser for interactive user setup and waits until the user closes it. The profile is stored in a directory that can be used later with BrowserConfig.user_data_dir. Please use BrowserProfiler.create_profile() instead. Example: ```python from crawl4ai.browser_profiler import BrowserProfiler profiler = BrowserProfiler() profile_path = await profiler.create_profile(profile_name="my-login-profile") ``` """ from .browser_profiler import BrowserProfiler # Create a BrowserProfiler instance and delegate to it profiler = BrowserProfiler(logger=logger) return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config) @staticmethod def list_profiles(): """ This method has been moved to the BrowserProfiler class. Lists all available browser profiles in the Crawl4AI profiles directory. Please use BrowserProfiler.list_profiles() instead. Example: ```python from crawl4ai.browser_profiler import BrowserProfiler profiler = BrowserProfiler() profiles = profiler.list_profiles() ``` """ from .browser_profiler import BrowserProfiler # Create a BrowserProfiler instance and delegate to it profiler = BrowserProfiler() return profiler.list_profiles() @staticmethod def delete_profile(profile_name_or_path): """ This method has been moved to the BrowserProfiler class. Delete a browser profile by name or path. Please use BrowserProfiler.delete_profile() instead. Example: ```python from crawl4ai.browser_profiler import BrowserProfiler profiler = BrowserProfiler() success = profiler.delete_profile("my-profile") ``` """ from .browser_profiler import BrowserProfiler # Create a BrowserProfiler instance and delegate to it profiler = BrowserProfiler() return profiler.delete_profile(profile_name_or_path) async def clone_runtime_state( src: BrowserContext, dst: BrowserContext, crawlerRunConfig: CrawlerRunConfig | None = None, browserConfig: BrowserConfig | None = None, ) -> None: """ Bring everything that *can* be changed at runtime from `src` → `dst`. 1. Cookies 2. localStorage (and sessionStorage, same API) 3. Extra headers, permissions, geolocation if supplied in configs """ # ── 1. cookies ──────────────────────────────────────────────────────────── cookies = await src.cookies() if cookies: await dst.add_cookies(cookies) # ── 2. localStorage / sessionStorage ────────────────────────────────────── state = await src.storage_state() for origin in state.get("origins", []): url = origin["origin"] kvs = origin.get("localStorage", []) if not kvs: continue page = dst.pages[0] if dst.pages else await dst.new_page() await page.goto(url, wait_until="domcontentloaded") for k, v in kvs: await page.evaluate("(k,v)=>localStorage.setItem(k,v)", k, v) # ── 3. runtime-mutable extras from configs ──────────────────────────────── # headers if browserConfig and browserConfig.headers: await dst.set_extra_http_headers(browserConfig.headers) # geolocation if crawlerRunConfig and crawlerRunConfig.geolocation: await dst.grant_permissions(["geolocation"]) await dst.set_geolocation( { "latitude": crawlerRunConfig.geolocation.latitude, "longitude": crawlerRunConfig.geolocation.longitude, "accuracy": crawlerRunConfig.geolocation.accuracy, } ) return dst class _CDPConnectionCache: """ Class-level cache for Playwright + CDP browser connections. When enabled via BrowserConfig(cache_cdp_connection=True), multiple BrowserManager instances connecting to the same cdp_url will share a single Playwright subprocess and CDP WebSocket. Reference-counted; the connection is closed when the last user releases it. """ _cache: Dict[str, Tuple] = {} # cdp_url -> (playwright, browser, ref_count) _lock: Optional[asyncio.Lock] = None # lazy-init to avoid event loop issues _lock_loop: Optional[asyncio.AbstractEventLoop] = None @classmethod def _get_lock(cls) -> asyncio.Lock: loop = asyncio.get_running_loop() if cls._lock is None or cls._lock_loop is not loop: cls._lock = asyncio.Lock() cls._lock_loop = loop return cls._lock @classmethod async def acquire(cls, cdp_url: str, use_undetected: bool = False): """Get or create a cached (playwright, browser) for this cdp_url.""" async with cls._get_lock(): if cdp_url in cls._cache: pw, browser, count = cls._cache[cdp_url] if browser.is_connected(): cls._cache[cdp_url] = (pw, browser, count + 1) return pw, browser # Stale connection — clean up and fall through to create new try: await pw.stop() except Exception: pass del cls._cache[cdp_url] # Create new connection if use_undetected: from patchright.async_api import async_playwright else: from playwright.async_api import async_playwright pw = await async_playwright().start() browser = await pw.chromium.connect_over_cdp(cdp_url) cls._cache[cdp_url] = (pw, browser, 1) return pw, browser @classmethod async def release(cls, cdp_url: str): """Decrement ref count; close connection when last user releases.""" async with cls._get_lock(): if cdp_url not in cls._cache: return pw, browser, count = cls._cache[cdp_url] if count <= 1: try: await browser.close() except Exception: pass try: await pw.stop() except Exception: pass del cls._cache[cdp_url] else: cls._cache[cdp_url] = (pw, browser, count - 1) @classmethod async def close_all(cls): """Force-close all cached connections. Call on application shutdown.""" async with cls._get_lock(): for cdp_url in list(cls._cache.keys()): pw, browser, _ = cls._cache[cdp_url] try: await browser.close() except Exception: pass try: await pw.stop() except Exception: pass cls._cache.clear() class BrowserManager: """ Manages the browser instance and context. Attributes: config (BrowserConfig): Configuration object containing all browser settings logger: Logger instance for recording events and errors browser (Browser): The browser instance default_context (BrowserContext): The default browser context managed_browser (ManagedBrowser): The managed browser instance playwright (Playwright): The Playwright instance sessions (dict): Dictionary to store session information session_ttl (int): Session timeout in seconds """ _playwright_instance = None # Class-level tracking of pages in use, keyed by browser endpoint (CDP URL or instance id) # This ensures multiple BrowserManager instances connecting to the same browser # share the same page tracking, preventing race conditions. _global_pages_in_use: dict = {} # endpoint_key -> set of pages _global_pages_lock: asyncio.Lock = None # Initialized lazily @classmethod def _get_global_lock(cls) -> asyncio.Lock: """Get or create the global pages lock (lazy initialization for async context).""" if cls._global_pages_lock is None: cls._global_pages_lock = asyncio.Lock() return cls._global_pages_lock @classmethod async def get_playwright(cls, use_undetected: bool = False): if use_undetected: from patchright.async_api import async_playwright else: from playwright.async_api import async_playwright cls._playwright_instance = await async_playwright().start() return cls._playwright_instance def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False): """ Initialize the BrowserManager with a browser configuration. Args: browser_config (BrowserConfig): Configuration object containing all browser settings logger: Logger instance for recording events and errors use_undetected (bool): Whether to use undetected browser (Patchright) """ self.config: BrowserConfig = browser_config self.logger = logger self.use_undetected = use_undetected # Browser state self.browser = None self.default_context = None self.managed_browser = None self.playwright = None self._using_cached_cdp = False # Session management self.sessions = {} self.session_ttl = 1800 # 30 minutes # Keep track of contexts by a "config signature," so each unique config reuses a single context self.contexts_by_config = {} self._contexts_lock = asyncio.Lock() # Context lifecycle tracking for LRU eviction self._context_refcounts = {} # sig -> int (active crawls using this context) self._context_last_used = {} # sig -> float (monotonic timestamp for LRU) self._page_to_sig = {} # page -> sig (for decrement lookup on release) self._max_contexts = 20 # LRU eviction threshold # Serialize context.new_page() across concurrent tasks to avoid races # when using a shared persistent context (context.pages may be empty # for all racers). Prevents 'Target page/context closed' errors. self._page_lock = asyncio.Lock() # Browser endpoint key for global page tracking (set after browser starts) self._browser_endpoint_key: Optional[str] = None # Browser recycling state (version-based approach) self._pages_served = 0 self._browser_version = 1 # included in signature, bump to create new browser self._pending_cleanup = {} # old_sig -> {"browser": browser, "contexts": [...], "done": Event} self._pending_cleanup_lock = asyncio.Lock() self._max_pending_browsers = 3 # safety cap — block if too many draining self._cleanup_slot_available = asyncio.Event() self._cleanup_slot_available.set() # starts open # Stealth adapter for stealth mode self._stealth_adapter = None if self.config.enable_stealth and not self.use_undetected: from .browser_adapter import StealthAdapter self._stealth_adapter = StealthAdapter() # Initialize ManagedBrowser if needed if self.config.use_managed_browser: self.managed_browser = ManagedBrowser( browser_type=self.config.browser_type, user_data_dir=self.config.user_data_dir, headless=self.config.headless, logger=self.logger, debugging_port=self.config.debugging_port, cdp_url=self.config.cdp_url, browser_config=self.config, ) async def start(self): """ Start the browser instance and set up the default context. How it works: 1. Check if Playwright is already initialized. 2. If not, initialize Playwright. 3. If managed browser is used, start it and connect to the CDP endpoint. 4. If managed browser is not used, launch the browser and set up the default context. Note: This method should be called in a separate task to avoid blocking the main event loop. """ if self.playwright is not None: await self.close() # Use cached CDP connection if enabled and cdp_url is set if self.config.cache_cdp_connection and self.config.cdp_url: self._using_cached_cdp = True self.config.use_managed_browser = True self.playwright, self.browser = await _CDPConnectionCache.acquire( self.config.cdp_url, self.use_undetected ) else: self._using_cached_cdp = False if self.use_undetected: from patchright.async_api import async_playwright else: from playwright.async_api import async_playwright # Initialize playwright self.playwright = await async_playwright().start() if self.config.cdp_url or self.config.use_managed_browser: self.config.use_managed_browser = True if not self._using_cached_cdp: cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url # Add CDP endpoint verification before connecting if not await self._verify_cdp_ready(cdp_url): raise Exception(f"CDP endpoint at {cdp_url} is not ready after startup") self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) contexts = self.browser.contexts # If browser_context_id is provided, we're using a pre-created context if self.config.browser_context_id: if self.logger: self.logger.debug( f"Using pre-existing browser context: {self.config.browser_context_id}", tag="BROWSER" ) # When connecting to a pre-created context, it should be in contexts if contexts: self.default_context = contexts[0] if self.logger: self.logger.debug( f"Found {len(contexts)} existing context(s), using first one", tag="BROWSER" ) else: # Context was created but not yet visible - wait a bit await asyncio.sleep(0.2) contexts = self.browser.contexts if contexts: self.default_context = contexts[0] else: # Still no contexts - this shouldn't happen with pre-created context if self.logger: self.logger.warning( "Pre-created context not found, creating new one", tag="BROWSER" ) self.default_context = await self.create_browser_context() elif contexts: self.default_context = contexts[0] else: self.default_context = await self.create_browser_context() await self.setup_context(self.default_context) else: browser_args = self._build_browser_args() # Launch appropriate browser type if self.config.browser_type == "firefox": self.browser = await self.playwright.firefox.launch(**browser_args) elif self.config.browser_type == "webkit": self.browser = await self.playwright.webkit.launch(**browser_args) else: self.browser = await self.playwright.chromium.launch(**browser_args) self.default_context = self.browser # Set the browser endpoint key for global page tracking self._browser_endpoint_key = self._compute_browser_endpoint_key() # Initialize global tracking set for this endpoint if needed if self._browser_endpoint_key not in BrowserManager._global_pages_in_use: BrowserManager._global_pages_in_use[self._browser_endpoint_key] = set() def _compute_browser_endpoint_key(self) -> str: """ Compute a unique key identifying this browser connection. For CDP connections, uses the normalized CDP URL so all BrowserManager instances connecting to the same browser share page tracking. For standalone browsers, uses instance id since each is independent. Returns: str: Unique identifier for this browser connection """ # For CDP connections, use the CDP URL as the key (normalized) if self.config.cdp_url: return self._normalize_cdp_url(self.config.cdp_url) # For managed browsers, use the CDP URL/port that was assigned if self.managed_browser: # Use debugging port as the key since it uniquely identifies the browser port = getattr(self.managed_browser, 'debugging_port', None) host = getattr(self.managed_browser, 'host', 'localhost') if port: return f"cdp:http://{host}:{port}" # For standalone browsers, use instance id (no sharing needed) return f"instance:{id(self)}" def _normalize_cdp_url(self, cdp_url: str) -> str: """ Normalize a CDP URL to a canonical form for consistent tracking. Handles various formats: - http://localhost:9222 - ws://localhost:9222/devtools/browser/xxx - http://localhost:9222?browser_id=xxx Returns: str: Normalized CDP key in format "cdp:http://host:port" """ from urllib.parse import urlparse parsed = urlparse(cdp_url) host = parsed.hostname or 'localhost' port = parsed.port or 9222 return f"cdp:http://{host}:{port}" def _get_pages_in_use(self) -> set: """Get the set of pages currently in use for this browser.""" if self._browser_endpoint_key and self._browser_endpoint_key in BrowserManager._global_pages_in_use: return BrowserManager._global_pages_in_use[self._browser_endpoint_key] # Fallback: shouldn't happen, but return empty set return set() def _mark_page_in_use(self, page) -> None: """Mark a page as in use.""" if self._browser_endpoint_key: if self._browser_endpoint_key not in BrowserManager._global_pages_in_use: BrowserManager._global_pages_in_use[self._browser_endpoint_key] = set() BrowserManager._global_pages_in_use[self._browser_endpoint_key].add(page) def _release_page_from_use(self, page) -> None: """Release a page from the in-use tracking.""" if self._browser_endpoint_key and self._browser_endpoint_key in BrowserManager._global_pages_in_use: BrowserManager._global_pages_in_use[self._browser_endpoint_key].discard(page) async def _verify_cdp_ready(self, cdp_url: str) -> bool: """Verify CDP endpoint is ready with exponential backoff. Supports multiple URL formats: - HTTP URLs: http://localhost:9222 - HTTP URLs with query params: http://localhost:9222?browser_id=XXX - WebSocket URLs: ws://localhost:9222/devtools/browser/XXX """ import aiohttp from urllib.parse import urlparse, urlunparse # If WebSocket URL, Playwright handles connection directly - skip HTTP verification if cdp_url.startswith(('ws://', 'wss://')): self.logger.debug(f"WebSocket CDP URL provided, skipping HTTP verification", tag="BROWSER") return True # Parse HTTP URL and properly construct /json/version endpoint parsed = urlparse(cdp_url) # Build URL with /json/version path, preserving query params verify_url = urlunparse(( parsed.scheme, parsed.netloc, '/json/version', # Always use this path for verification '', # params parsed.query, # preserve query string '' # fragment )) self.logger.debug(f"Starting CDP verification for {verify_url}", tag="BROWSER") for attempt in range(5): try: async with aiohttp.ClientSession() as session: async with session.get(verify_url, timeout=aiohttp.ClientTimeout(total=2)) as response: if response.status == 200: self.logger.debug(f"CDP endpoint ready after {attempt + 1} attempts", tag="BROWSER") return True except Exception as e: self.logger.debug(f"CDP check attempt {attempt + 1} failed: {e}", tag="BROWSER") delay = 0.5 * (1.4 ** attempt) self.logger.debug(f"Waiting {delay:.2f}s before next CDP check...", tag="BROWSER") await asyncio.sleep(delay) self.logger.debug(f"CDP verification failed after 5 attempts", tag="BROWSER") return False def _build_browser_args(self) -> dict: """Build browser launch arguments from config.""" args = [ "--disable-gpu", "--disable-gpu-compositing", "--disable-software-rasterizer", "--no-sandbox", "--disable-dev-shm-usage", "--no-first-run", "--no-default-browser-check", "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", "--disable-blink-features=AutomationControlled", "--window-position=400,0", "--disable-renderer-backgrounding", "--disable-ipc-flooding-protection", "--force-color-profile=srgb", "--mute-audio", "--disable-background-timer-throttling", # Memory-saving flags: disable unused Chrome features "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", "--disable-component-update", "--disable-domain-reliability", # "--single-process", f"--window-size={self.config.viewport_width},{self.config.viewport_height}", ] if self.config.memory_saving_mode: args.extend([ "--aggressive-cache-discard", '--js-flags=--max-old-space-size=512', ]) if self.config.light_mode: args.extend(BROWSER_DISABLE_OPTIONS) if self.config.text_mode: args.extend( [ "--blink-settings=imagesEnabled=false", "--disable-remote-fonts", "--disable-images", "--disable-javascript", "--disable-software-rasterizer", "--disable-dev-shm-usage", ] ) if self.config.extra_args: args.extend(self.config.extra_args) # Deduplicate args args = list(dict.fromkeys(args)) browser_args = {"headless": self.config.headless, "args": args} if self.config.chrome_channel: browser_args["channel"] = self.config.chrome_channel if self.config.accept_downloads: browser_args["downloads_path"] = self.config.downloads_path or os.path.join( os.getcwd(), "downloads" ) os.makedirs(browser_args["downloads_path"], exist_ok=True) if self.config.proxy: warnings.warn( "BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.", DeprecationWarning, ) if self.config.proxy_config: from playwright.async_api import ProxySettings proxy_settings = ProxySettings( server=self.config.proxy_config.server, username=self.config.proxy_config.username, password=self.config.proxy_config.password, ) browser_args["proxy"] = proxy_settings return browser_args async def setup_context( self, context: BrowserContext, crawlerRunConfig: CrawlerRunConfig = None, is_default=False, ): """ Set up a browser context with the configured options. How it works: 1. Set extra HTTP headers if provided. 2. Add cookies if provided. 3. Load storage state if provided. 4. Accept downloads if enabled. 5. Set default timeouts for navigation and download. 6. Set user agent if provided. 7. Set browser hints if provided. 8. Set proxy if provided. 9. Set downloads path if provided. 10. Set storage state if provided. 11. Set cache if provided. 12. Set extra HTTP headers if provided. 13. Add cookies if provided. 14. Set default timeouts for navigation and download if enabled. 15. Set user agent if provided. 16. Set browser hints if provided. Args: context (BrowserContext): The browser context to set up crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings is_default (bool): Flag indicating if this is the default context Returns: None """ if self.config.headers: await context.set_extra_http_headers(self.config.headers) if self.config.cookies: await context.add_cookies(self.config.cookies) if self.config.storage_state: await context.storage_state(path=None) if self.config.accept_downloads: context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) if self.config.downloads_path: context._impl_obj._options["accept_downloads"] = True context._impl_obj._options[ "downloads_path" ] = self.config.downloads_path # Handle user agent and browser hints if self.config.user_agent: combined_headers = { "User-Agent": self.config.user_agent, "sec-ch-ua": self.config.browser_hint, } combined_headers.update(self.config.headers) await context.set_extra_http_headers(combined_headers) # Add default cookie (skip for raw:/file:// URLs which are not valid cookie URLs) cookie_url = None if crawlerRunConfig and crawlerRunConfig.url: url = crawlerRunConfig.url # Only set cookie for http/https URLs if url.startswith(("http://", "https://")): cookie_url = url elif crawlerRunConfig.base_url and crawlerRunConfig.base_url.startswith(("http://", "https://")): # Use base_url as fallback for raw:/file:// URLs cookie_url = crawlerRunConfig.base_url if cookie_url: await context.add_cookies( [ { "name": "cookiesEnabled", "value": "true", "url": cookie_url, } ] ) # Handle navigator overrides if crawlerRunConfig: if ( crawlerRunConfig.override_navigator or crawlerRunConfig.simulate_user or crawlerRunConfig.magic ): await context.add_init_script(load_js_script("navigator_overrider")) # Apply custom init_scripts from BrowserConfig (for stealth evasions, etc.) if self.config.init_scripts: for script in self.config.init_scripts: await context.add_init_script(script) async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None): """ Creates and returns a new browser context with configured settings. Applies text-only mode settings if text_mode is enabled in config. Returns: Context: Browser context object with the specified configurations """ # Base settings user_agent = self.config.headers.get("User-Agent", self.config.user_agent) viewport_settings = { "width": self.config.viewport_width, "height": self.config.viewport_height, } proxy_settings = {"server": self.config.proxy} if self.config.proxy else None blocked_extensions = [ # Images "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", # Fonts "woff", "woff2", "ttf", "otf", "eot", # Styles # 'css', 'less', 'scss', 'sass', # Media "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", "m4a", "opus", "flac", # Documents "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", # Archives "zip", "rar", "7z", "tar", "gz", # Scripts and data "xml", "swf", "wasm", ] # Common context settings context_settings = { "user_agent": user_agent, "viewport": viewport_settings, "proxy": proxy_settings, "accept_downloads": self.config.accept_downloads, "storage_state": self.config.storage_state, "ignore_https_errors": self.config.ignore_https_errors, "device_scale_factor": self.config.device_scale_factor, "java_script_enabled": self.config.java_script_enabled, } if crawlerRunConfig: # Check if there is value for crawlerRunConfig.proxy_config set add that to context if crawlerRunConfig.proxy_config: from playwright.async_api import ProxySettings proxy_settings = ProxySettings( server=crawlerRunConfig.proxy_config.server, username=crawlerRunConfig.proxy_config.username, password=crawlerRunConfig.proxy_config.password, ) context_settings["proxy"] = proxy_settings if self.config.text_mode: text_mode_settings = { "has_touch": False, "is_mobile": False, } # Update context settings with text mode settings context_settings.update(text_mode_settings) # inject locale / tz / geo if user provided them if crawlerRunConfig: if crawlerRunConfig.locale: context_settings["locale"] = crawlerRunConfig.locale if crawlerRunConfig.timezone_id: context_settings["timezone_id"] = crawlerRunConfig.timezone_id if crawlerRunConfig.geolocation: context_settings["geolocation"] = { "latitude": crawlerRunConfig.geolocation.latitude, "longitude": crawlerRunConfig.geolocation.longitude, "accuracy": crawlerRunConfig.geolocation.accuracy, } # ensure geolocation permission perms = context_settings.get("permissions", []) perms.append("geolocation") context_settings["permissions"] = perms # Create and return the context with all settings context = await self.browser.new_context(**context_settings) # Apply text mode settings if enabled if self.config.text_mode: # Create and apply route patterns for each extension for ext in blocked_extensions: await context.route(f"**/*.{ext}", lambda route: route.abort()) return context def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: """ Hash ONLY the CrawlerRunConfig fields that affect browser context creation (create_browser_context) or context setup (setup_context). Whitelist approach: fields like css_selector, word_count_threshold, screenshot, verbose, etc. do NOT cause a new context to be created. """ import json sig_dict = {} # Fields that flow into create_browser_context() pc = crawlerRunConfig.proxy_config if pc is not None: sig_dict["proxy_config"] = { "server": getattr(pc, "server", None), "username": getattr(pc, "username", None), "password": getattr(pc, "password", None), } else: sig_dict["proxy_config"] = None sig_dict["locale"] = crawlerRunConfig.locale sig_dict["timezone_id"] = crawlerRunConfig.timezone_id geo = crawlerRunConfig.geolocation if geo is not None: sig_dict["geolocation"] = { "latitude": geo.latitude, "longitude": geo.longitude, "accuracy": geo.accuracy, } else: sig_dict["geolocation"] = None # Fields that flow into setup_context() as init scripts sig_dict["override_navigator"] = crawlerRunConfig.override_navigator sig_dict["simulate_user"] = crawlerRunConfig.simulate_user sig_dict["magic"] = crawlerRunConfig.magic # Browser version — bumped on recycle to force new browser instance sig_dict["_browser_version"] = self._browser_version signature_json = json.dumps(sig_dict, sort_keys=True, default=str) return hashlib.sha256(signature_json.encode("utf-8")).hexdigest() def _evict_lru_context_locked(self): """ If contexts exceed the limit, find the least-recently-used context with zero active crawls and remove it from all tracking dicts. MUST be called while holding self._contexts_lock. Returns the BrowserContext to close (caller closes it OUTSIDE the lock), or None if no eviction is needed or possible. """ if len(self.contexts_by_config) <= self._max_contexts: return None # Sort candidates by last-used timestamp (oldest first) candidates = sorted( self._context_last_used.items(), key=lambda item: item[1], ) for evict_sig, _ in candidates: if self._context_refcounts.get(evict_sig, 0) == 0: ctx = self.contexts_by_config.pop(evict_sig, None) self._context_refcounts.pop(evict_sig, None) self._context_last_used.pop(evict_sig, None) # Clean up stale page->sig mappings for evicted context stale_pages = [ p for p, s in self._page_to_sig.items() if s == evict_sig ] for p in stale_pages: del self._page_to_sig[p] return ctx # All contexts are in active use — cannot evict return None async def _apply_stealth_to_page(self, page): """Apply stealth to a page if stealth mode is enabled""" if self._stealth_adapter: try: await self._stealth_adapter.apply_stealth(page) except Exception as e: if self.logger: self.logger.warning( message="Failed to apply stealth to page: {error}", tag="STEALTH", params={"error": str(e)} ) async def _get_page_by_target_id(self, context: BrowserContext, target_id: str): """ Get an existing page by its CDP target ID. This is used when connecting to a pre-created browser context with an existing page. Playwright may not immediately see targets created via raw CDP commands, so we use CDP to get all targets and find the matching one. Args: context: The browser context to search in target_id: The CDP target ID to find Returns: Page object if found, None otherwise """ try: # First check if Playwright already sees the page for page in context.pages: # Playwright's internal target ID might match if hasattr(page, '_impl_obj') and hasattr(page._impl_obj, '_target_id'): if page._impl_obj._target_id == target_id: return page # If not found, try using CDP to get targets if hasattr(self.browser, '_impl_obj') and hasattr(self.browser._impl_obj, '_connection'): cdp_session = await context.new_cdp_session(context.pages[0] if context.pages else None) if cdp_session: try: result = await cdp_session.send("Target.getTargets") targets = result.get("targetInfos", []) for target in targets: if target.get("targetId") == target_id: # Found the target - if it's a page type, we can use it if target.get("type") == "page": # The page exists, let Playwright discover it await asyncio.sleep(0.1) # Refresh pages list if context.pages: return context.pages[0] finally: await cdp_session.detach() # Fallback: if there are any pages now, return the first one if context.pages: return context.pages[0] return None except Exception as e: if self.logger: self.logger.warning( message="Failed to get page by target ID: {error}", tag="BROWSER", params={"error": str(e)} ) return None async def get_page(self, crawlerRunConfig: CrawlerRunConfig): """ Get a page for the given session ID, creating a new one if needed. Args: crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings Returns: (page, context): The Page and its BrowserContext """ self._cleanup_expired_sessions() # If a session_id is provided and we already have it, reuse that page + context if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: context, page, _ = self.sessions[crawlerRunConfig.session_id] # Update last-used timestamp self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) return page, context # If using a managed browser, just grab the shared default_context if self.config.use_managed_browser: # If create_isolated_context is True, create isolated contexts for concurrent crawls # Uses the same caching mechanism as non-CDP mode: cache context by config signature, # but always create a new page. This prevents navigation conflicts while allowing # context reuse for multiple URLs with the same config (e.g., batch/deep crawls). if self.config.create_isolated_context: config_signature = self._make_config_signature(crawlerRunConfig) to_close = None async with self._contexts_lock: if config_signature in self.contexts_by_config: context = self.contexts_by_config[config_signature] else: context = await self.create_browser_context(crawlerRunConfig) await self.setup_context(context, crawlerRunConfig) self.contexts_by_config[config_signature] = context self._context_refcounts[config_signature] = 0 to_close = self._evict_lru_context_locked() # Increment refcount INSIDE lock before releasing self._context_refcounts[config_signature] = ( self._context_refcounts.get(config_signature, 0) + 1 ) self._context_last_used[config_signature] = time.monotonic() # Close evicted context OUTSIDE lock if to_close is not None: try: await to_close.close() except Exception: pass # Always create a new page for each crawl (isolation for navigation) try: page = await context.new_page() except Exception: async with self._contexts_lock: if config_signature in self._context_refcounts: self._context_refcounts[config_signature] = max( 0, self._context_refcounts[config_signature] - 1 ) raise await self._apply_stealth_to_page(page) self._page_to_sig[page] = config_signature elif self.config.storage_state: tmp_context = await self.create_browser_context(crawlerRunConfig) ctx = self.default_context # default context, one window only ctx = await clone_runtime_state(tmp_context, ctx, crawlerRunConfig, self.config) # Close the temporary context — only needed as a clone source try: await tmp_context.close() except Exception: pass context = ctx # so (page, context) return value is correct # Avoid concurrent new_page on shared persistent context # See GH-1198: context.pages can be empty under races async with self._page_lock: page = await ctx.new_page() await self._apply_stealth_to_page(page) else: context = self.default_context # Handle pre-existing target case (for reconnecting to specific CDP targets) if self.config.browser_context_id and self.config.target_id: page = await self._get_page_by_target_id(context, self.config.target_id) if not page: async with self._page_lock: page = await context.new_page() self._mark_page_in_use(page) await self._apply_stealth_to_page(page) else: # Mark pre-existing target as in use self._mark_page_in_use(page) else: # For CDP connections (external browser), multiple Playwright connections # create separate browser/context objects. Page reuse across connections # isn't reliable because each connection sees different page objects. # Always create new pages for CDP to avoid cross-connection race conditions. if self.config.cdp_url and not self.config.use_managed_browser: async with self._page_lock: page = await context.new_page() self._mark_page_in_use(page) await self._apply_stealth_to_page(page) else: # For managed browsers (single process), page reuse is safe. # Use lock to safely check for available pages and track usage. # This prevents race conditions when multiple crawls run concurrently. async with BrowserManager._get_global_lock(): pages = context.pages pages_in_use = self._get_pages_in_use() # Find first available page (exists and not currently in use) available_page = next( (p for p in pages if p not in pages_in_use), None ) if available_page: page = available_page else: # No available pages - create a new one page = await context.new_page() await self._apply_stealth_to_page(page) # Mark page as in use (global tracking) self._mark_page_in_use(page) else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) to_close = None async with self._contexts_lock: if config_signature in self.contexts_by_config: context = self.contexts_by_config[config_signature] else: # Create and setup a new context context = await self.create_browser_context(crawlerRunConfig) await self.setup_context(context, crawlerRunConfig) self.contexts_by_config[config_signature] = context self._context_refcounts[config_signature] = 0 to_close = self._evict_lru_context_locked() # Increment refcount INSIDE lock before releasing self._context_refcounts[config_signature] = ( self._context_refcounts.get(config_signature, 0) + 1 ) self._context_last_used[config_signature] = time.monotonic() # Close evicted context OUTSIDE lock if to_close is not None: try: await to_close.close() except Exception: pass # Create a new page from the chosen context try: page = await context.new_page() except Exception: async with self._contexts_lock: if config_signature in self._context_refcounts: self._context_refcounts[config_signature] = max( 0, self._context_refcounts[config_signature] - 1 ) raise await self._apply_stealth_to_page(page) self._page_to_sig[page] = config_signature # If a session_id is specified, store this session so we can reuse later if crawlerRunConfig.session_id: self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) self._pages_served += 1 # Check if browser recycle threshold is hit — bump version for next requests # This happens AFTER incrementing counter so concurrent requests see correct count await self._maybe_bump_browser_version() return page, context async def kill_session(self, session_id: str): """ Kill a browser session and clean up resources. Args: session_id (str): The session ID to kill. """ if session_id in self.sessions: context, page, _ = self.sessions[session_id] self._release_page_from_use(page) # Decrement context refcount for the session's page should_close_context = False async with self._contexts_lock: sig = self._page_to_sig.pop(page, None) if sig is not None and sig in self._context_refcounts: self._context_refcounts[sig] = max( 0, self._context_refcounts[sig] - 1 ) # Only close the context if no other pages are using it # (refcount dropped to 0) AND we own the context (not managed) if not self.config.use_managed_browser: if self._context_refcounts.get(sig, 0) == 0: self.contexts_by_config.pop(sig, None) self._context_refcounts.pop(sig, None) self._context_last_used.pop(sig, None) should_close_context = True await page.close() if should_close_context: await context.close() del self.sessions[session_id] def release_page(self, page): """ Release a page from the in-use tracking set (global tracking). Sync variant — does NOT decrement context refcount. """ self._release_page_from_use(page) async def release_page_with_context(self, page): """ Release a page and decrement its context's refcount under the lock. Should be called from the async crawl finally block instead of release_page() so the context lifecycle is properly tracked. """ self._release_page_from_use(page) sig = None refcount = -1 async with self._contexts_lock: sig = self._page_to_sig.pop(page, None) if sig is not None and sig in self._context_refcounts: self._context_refcounts[sig] = max( 0, self._context_refcounts[sig] - 1 ) refcount = self._context_refcounts[sig] # Check if this signature belongs to an old browser waiting to be cleaned up if sig is not None and refcount == 0: await self._maybe_cleanup_old_browser(sig) def _should_recycle(self) -> bool: """Check if page threshold reached for browser recycling.""" limit = self.config.max_pages_before_recycle if limit <= 0: return False return self._pages_served >= limit async def _maybe_bump_browser_version(self): """Bump browser version if threshold reached, moving old browser to pending cleanup. New requests automatically get a new browser (via new signature). Old browser drains naturally and gets cleaned up when refcount hits 0. """ if not self._should_recycle(): return # Safety cap: wait if too many old browsers are draining while True: async with self._pending_cleanup_lock: # Re-check threshold under lock (another request may have bumped already) if not self._should_recycle(): return # Check safety cap if len(self._pending_cleanup) >= self._max_pending_browsers: if self.logger: self.logger.debug( message="Waiting for old browser to drain (pending: {count})", tag="BROWSER", params={"count": len(self._pending_cleanup)}, ) self._cleanup_slot_available.clear() # Release lock and wait else: # We have a slot — do the bump inside this lock hold old_version = self._browser_version old_sigs = [] async with self._contexts_lock: for sig in list(self._context_refcounts.keys()): old_sigs.append(sig) if self.logger: self.logger.info( message="Bumping browser version {old} -> {new} after {count} pages", tag="BROWSER", params={ "old": old_version, "new": old_version + 1, "count": self._pages_served, }, ) # Mark old signatures for cleanup when their refcount hits 0 done_event = asyncio.Event() for sig in old_sigs: self._pending_cleanup[sig] = { "version": old_version, "done": done_event, } # Bump version — new get_page() calls will create new contexts self._browser_version += 1 self._pages_served = 0 return # Done! # If we get here, we need to wait for a cleanup slot await self._cleanup_slot_available.wait() async def _maybe_cleanup_old_browser(self, sig: str): """Clean up an old browser's context if its refcount hit 0 and it's pending cleanup.""" async with self._pending_cleanup_lock: if sig not in self._pending_cleanup: return # Not an old browser signature cleanup_info = self._pending_cleanup.pop(sig) old_version = cleanup_info["version"] if self.logger: self.logger.debug( message="Cleaning up context from browser version {version} (sig: {sig})", tag="BROWSER", params={"version": old_version, "sig": sig[:12]}, ) # Remove context from tracking async with self._contexts_lock: context = self.contexts_by_config.pop(sig, None) self._context_refcounts.pop(sig, None) self._context_last_used.pop(sig, None) # Close context outside locks if context is not None: try: await context.close() except Exception: pass # Check if any signatures from this old version remain remaining_old = [ s for s, info in self._pending_cleanup.items() if info["version"] == old_version ] if not remaining_old: if self.logger: self.logger.info( message="All contexts from browser version {version} cleaned up", tag="BROWSER", params={"version": old_version}, ) # Open a cleanup slot if we're below the cap if len(self._pending_cleanup) < self._max_pending_browsers: self._cleanup_slot_available.set() def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL.""" current_time = time.time() expired_sessions = [ sid for sid, (_, _, last_used) in self.sessions.items() if current_time - last_used > self.session_ttl ] for sid in expired_sessions: asyncio.create_task(self.kill_session(sid)) async def close(self): """Close all browser resources and clean up.""" # Cached CDP path: only clean up this instance's sessions/contexts, # then release the shared connection reference. if self._using_cached_cdp: session_ids = list(self.sessions.keys()) for session_id in session_ids: await self.kill_session(session_id) for ctx in self.contexts_by_config.values(): try: await ctx.close() except Exception: pass self.contexts_by_config.clear() self._context_refcounts.clear() self._context_last_used.clear() self._page_to_sig.clear() await _CDPConnectionCache.release(self.config.cdp_url) self.browser = None self.playwright = None self._using_cached_cdp = False return if self.config.cdp_url: # When using external CDP, we don't own the browser process. # If cdp_cleanup_on_close is True, properly disconnect from the browser # and clean up Playwright resources. This frees the browser for other clients. if self.config.cdp_cleanup_on_close: # First close all sessions (pages) session_ids = list(self.sessions.keys()) for session_id in session_ids: await self.kill_session(session_id) # Close all contexts we created for ctx in self.contexts_by_config.values(): try: await ctx.close() except Exception: pass self.contexts_by_config.clear() self._context_refcounts.clear() self._context_last_used.clear() self._page_to_sig.clear() # Disconnect from browser (doesn't terminate it, just releases connection) if self.browser: try: await self.browser.close() except Exception as e: if self.logger: self.logger.debug( message="Error disconnecting from CDP browser: {error}", tag="BROWSER", params={"error": str(e)} ) self.browser = None # Allow time for CDP connection to fully release before another client connects if self.config.cdp_close_delay > 0: await asyncio.sleep(self.config.cdp_close_delay) # Stop Playwright instance to prevent memory leaks if self.playwright: await self.playwright.stop() self.playwright = None return if self.config.sleep_on_close: await asyncio.sleep(0.5) session_ids = list(self.sessions.keys()) for session_id in session_ids: await self.kill_session(session_id) # Now close all contexts we created. This reclaims memory from ephemeral contexts. for ctx in self.contexts_by_config.values(): try: await ctx.close() except Exception as e: self.logger.error( message="Error closing context: {error}", tag="ERROR", params={"error": str(e)} ) self.contexts_by_config.clear() self._context_refcounts.clear() self._context_last_used.clear() self._page_to_sig.clear() if self.browser: await self.browser.close() self.browser = None if self.managed_browser: await asyncio.sleep(0.5) await self.managed_browser.cleanup() self.managed_browser = None if self.playwright: await self.playwright.stop() self.playwright = None