diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index a287cfbe..9198fa1d 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -270,7 +270,7 @@ class BrowserConfig: host: str = "localhost", ): self.browser_type = browser_type - self.headless = headless + self.headless = "new" if headless else False self.browser_mode = browser_mode self.use_managed_browser = use_managed_browser self.cdp_url = cdp_url diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index bbee502b..fca2d673 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -4,18 +4,25 @@ import sys import time from colorama import Fore from pathlib import Path -from typing import Optional, List, Generic, TypeVar +from typing import Optional, List import json import asyncio # from contextlib import nullcontext, asynccontextmanager from contextlib import asynccontextmanager -from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult +from .models import ( + CrawlResult, + MarkdownGenerationResult, + DispatchResult, + ScrapingResult, + CrawlResultContainer, + RunManyReturn +) from .async_database import async_db_manager from .chunking_strategy import * # noqa: F403 from .chunking_strategy import IdentityChunking from .content_filter_strategy import * # noqa: F403 -from .extraction_strategy import * # noqa: F403 +from .extraction_strategy import * # noqa: F403 from .extraction_strategy import NoExtractionStrategy from .async_crawler_strategy import ( AsyncCrawlerStrategy, @@ -30,7 +37,7 @@ from .markdown_generation_strategy import ( from .deep_crawling import DeepCrawlDecorator from .async_logger import AsyncLogger, AsyncLoggerBase from .async_configs import BrowserConfig, CrawlerRunConfig -from .async_dispatcher import * # noqa: F403 +from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .utils import ( @@ -42,45 +49,6 @@ from .utils import ( RobotsParser, ) -from typing import Union, AsyncGenerator - -CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) -# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] - -class CrawlResultContainer(Generic[CrawlResultT]): - def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): - # Normalize to a list - if isinstance(results, list): - self._results = results - else: - self._results = [results] - - def __iter__(self): - return iter(self._results) - - def __getitem__(self, index): - return self._results[index] - - def __len__(self): - return len(self._results) - - def __getattr__(self, attr): - # Delegate attribute access to the first element. - if self._results: - return getattr(self._results[0], attr) - raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") - - def __repr__(self): - return f"{self.__class__.__name__}({self._results!r})" - -# Redefine the union type. Now synchronous calls always return a container, -# while stream mode is handled with an AsyncGenerator. -RunManyReturn = Union[ - CrawlResultContainer[CrawlResultT], - AsyncGenerator[CrawlResultT, None] -] - - class AsyncWebCrawler: """ @@ -193,45 +161,18 @@ class AsyncWebCrawler: # Decorate arun method with deep crawling capabilities self._deep_handler = DeepCrawlDecorator(self) - self.arun = self._deep_handler(self.arun) + self.arun = self._deep_handler(self.arun) async def start(self): """ Start the crawler explicitly without using context manager.
This is equivalent to using 'async with' but gives more control over the lifecycle. - - This method will: - 1. Check for builtin browser if browser_mode is 'builtin' - 2. Initialize the browser and context - 3. Perform warmup sequence - 4. Return the crawler instance for method chaining - Returns: AsyncWebCrawler: The initialized crawler instance """ - # Check for builtin browser if requested - if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url: - # Import here to avoid circular imports - from .browser_profiler import BrowserProfiler - profiler = BrowserProfiler(logger=self.logger) - - # Get builtin browser info or launch if needed - browser_info = profiler.get_builtin_browser_info() - if not browser_info: - self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER") - cdp_url = await profiler.launch_builtin_browser() - if not cdp_url: - self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER") - else: - self.browser_config.cdp_url = cdp_url - self.browser_config.use_managed_browser = True - else: - self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER") - self.browser_config.cdp_url = browser_info.get('cdp_url') - self.browser_config.use_managed_browser = True - await self.crawler_strategy.__aenter__() - await self.awarmup() + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + self.ready = True return self async def close(self): @@ -251,18 +192,6 @@ class AsyncWebCrawler: async def __aexit__(self, exc_type, exc_val, exc_tb): await self.close() - async def awarmup(self): - """ - Initialize the crawler with warm-up sequence. - - This method: - 1. Logs initialization info - 2. Sets up browser configuration - 3. 
Marks the crawler as ready - """ - self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") - self.ready = True - @asynccontextmanager async def nullcontext(self): """Async no-op context manager.""" @@ -305,7 +234,7 @@ class AsyncWebCrawler: # Auto-start if not ready if not self.ready: await self.start() - + config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") @@ -319,9 +248,7 @@ class AsyncWebCrawler: config.cache_mode = CacheMode.ENABLED # Create cache context - cache_context = CacheContext( - url, config.cache_mode, False - ) + cache_context = CacheContext(url, config.cache_mode, False) # Initialize processing variables async_response: AsyncCrawlResponse = None @@ -351,7 +278,7 @@ class AsyncWebCrawler: # if config.screenshot and not screenshot or config.pdf and not pdf: if config.screenshot and not screenshot_data: cached_result = None - + if config.pdf and not pdf_data: cached_result = None @@ -383,14 +310,18 @@ class AsyncWebCrawler: # Check robots.txt if enabled if config and config.check_robots_txt: - if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent): + if not await self.robots_parser.can_fetch( + url, self.browser_config.user_agent + ): return CrawlResult( url=url, html="", success=False, status_code=403, error_message="Access denied by robots.txt", - response_headers={"X-Robots-Status": "Blocked by robots.txt"} + response_headers={ + "X-Robots-Status": "Blocked by robots.txt" + }, ) ############################## @@ -417,7 +348,7 @@ class AsyncWebCrawler: ############################################################### # Process the HTML content, Call CrawlerStrategy.process_html # ############################################################### - crawl_result : CrawlResult = await self.aprocess_html( + crawl_result: CrawlResult = await self.aprocess_html( url=url, html=html, extracted_content=extracted_content, @@ -494,7 +425,7 @@ class AsyncWebCrawler: tag="ERROR", ) - return CrawlResultContainer( + return CrawlResultContainer( CrawlResult( url=url, html="", success=False, error_message=error_message ) @@ -539,15 +470,14 @@ class AsyncWebCrawler: # Process HTML content params = config.__dict__.copy() - params.pop("url", None) + params.pop("url", None) # Add keys from kwargs to params that don't already exist in params params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) - ################################ # Scraping Strategy Execution # ################################ - result : ScrapingResult = scraping_strategy.scrap(url, html, **params) + result: ScrapingResult = scraping_strategy.scrap(url, html, **params) if result is None: raise ValueError( @@ -596,7 +526,10 @@ class AsyncWebCrawler: self.logger.info( message="{url:.50}...
| Time: {timing}s", tag="SCRAPE", - params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) / 1000, + }, ) ################################ @@ -671,7 +604,7 @@ class AsyncWebCrawler: async def arun_many( self, urls: List[str], - config: Optional[CrawlerRunConfig] = None, + config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility # word_count_threshold=MIN_WORD_THRESHOLD, @@ -685,8 +618,8 @@ class AsyncWebCrawler: # pdf: bool = False, # user_agent: str = None, # verbose=True, - **kwargs - ) -> RunManyReturn: + **kwargs, + ) -> RunManyReturn: """ Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy. @@ -742,37 +675,32 @@ class AsyncWebCrawler: def transform_result(task_result): return ( - setattr(task_result.result, 'dispatch_result', - DispatchResult( - task_id=task_result.task_id, - memory_usage=task_result.memory_usage, - peak_memory=task_result.peak_memory, - start_time=task_result.start_time, - end_time=task_result.end_time, - error_message=task_result.error_message, - ) - ) or task_result.result + setattr( + task_result.result, + "dispatch_result", + DispatchResult( + task_id=task_result.task_id, + memory_usage=task_result.memory_usage, + peak_memory=task_result.peak_memory, + start_time=task_result.start_time, + end_time=task_result.end_time, + error_message=task_result.error_message, + ), ) + or task_result.result + ) stream = config.stream - + if stream: + async def result_transformer(): - async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config): + async for task_result in dispatcher.run_urls_stream( + crawler=self, urls=urls, config=config + ): yield transform_result(task_result) + return result_transformer() else: _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) - return [transform_result(res) for res in _results] - - async def aclear_cache(self): - """Clear the cache database.""" - await async_db_manager.cleanup() - - async def aflush_cache(self): - """Flush the cache database.""" - await async_db_manager.aflush_db() - - async def aget_cache_size(self): - """Get the total number of cached items.""" - return await async_db_manager.aget_total_count() + return [transform_result(res) for res in _results] diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 288ad7e9..129a940b 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -50,7 +50,7 @@ class BrowserManager: self.logger = logger # Create strategy based on configuration - self._strategy = self._create_strategy() + self.strategy = self._create_strategy() # Initialize state variables for compatibility with existing code self.browser = None @@ -92,23 +92,23 @@ class BrowserManager: self: For method chaining """ # Start the strategy - await self._strategy.start() + await self.strategy.start() # Update legacy references - self.browser = self._strategy.browser - self.default_context = self._strategy.default_context + self.browser = self.strategy.browser + self.default_context = self.strategy.default_context # Set browser process reference (for CDP strategy) - if hasattr(self._strategy, 'browser_process'): - self.managed_browser = self._strategy + if hasattr(self.strategy, 'browser_process'): + self.managed_browser = self.strategy # Set Playwright reference - self.playwright = self._strategy.playwright + 
self.playwright = self.strategy.playwright # Sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions - self.session_ttl = self._strategy.session_ttl + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions + self.session_ttl = self.strategy.session_ttl return self @@ -122,11 +122,11 @@ class BrowserManager: Tuple of (Page, BrowserContext) """ # Delegate to strategy - page, context = await self._strategy.get_page(crawlerRunConfig) + page, context = await self.strategy.get_page(crawlerRunConfig) # Sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions return page, context @@ -144,14 +144,15 @@ class BrowserManager: List of (Page, Context) tuples """ # Delegate to strategy - pages = await self._strategy.get_pages(crawlerRunConfig, count) + pages = await self.strategy.get_pages(crawlerRunConfig, count) # Sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions return pages + # Just for legacy compatibility async def kill_session(self, session_id: str): """Kill a browser session and clean up resources. @@ -159,33 +160,16 @@ class BrowserManager: session_id: The session ID to kill """ # Handle kill_session via our strategy if it supports it - await self._strategy.kill_session(session_id) + await self.strategy.kill_session(session_id) # sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - # Use strategy's implementation if available - if hasattr(self._strategy, '_cleanup_expired_sessions'): - self._strategy._cleanup_expired_sessions() - return - - # Otherwise use our own implementation - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions async def close(self): """Close the browser and clean up resources.""" # Delegate to strategy - await self._strategy.close() + await self.strategy.close() # Reset legacy references self.browser = None diff --git a/crawl4ai/browser/strategies/base.py b/crawl4ai/browser/strategies/base.py index 2c500389..5c46cbe4 100644 --- a/crawl4ai/browser/strategies/base.py +++ b/crawl4ai/browser/strategies/base.py @@ -82,6 +82,9 @@ class BaseBrowserStrategy(ABC): return self @abstractmethod + async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + pass + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page with specified configuration. 
@@ -94,6 +97,23 @@ class BaseBrowserStrategy(ABC): Returns: Tuple of (Page, BrowserContext) """ + # Clean up expired sessions first + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + page, context = await self._generate_page(crawlerRunConfig) + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context - pass async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: @@ -120,31 +140,29 @@ """ # Define common browser arguments that improve performance and stability args = [ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", "--no-sandbox", - "--disable-dev-shm-usage", "--no-first-run", "--no-default-browser-check", - "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", "--window-position=400,0", - "--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", "--force-color-profile=srgb", "--mute-audio", + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + "--disable-infobars", + "--disable-blink-features=AutomationControlled", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", "--disable-background-timer-throttling", f"--window-size={self.config.viewport_width},{self.config.viewport_height}", ] # Define browser disable options for light mode browser_disable_options = [ - "--disable-background-networking", - "--disable-background-timer-throttling", "--disable-backgrounding-occluded-windows", "--disable-breakpad", "--disable-client-side-phishing-detection", @@ -153,13 +171,10 @@ "--disable-extensions", "--disable-features=TranslateUI", "--disable-hang-monitor", - "--disable-ipc-flooding-protection", "--disable-popup-blocking", "--disable-prompt-on-repost", "--disable-sync", - "--force-color-profile=srgb", "--metrics-recording-only", - "--no-first-run", "--password-store=basic", "--use-mock-keychain", ] diff --git a/crawl4ai/browser/strategies/builtin.py b/crawl4ai/browser/strategies/builtin.py index 2423ee04..678346fc 100644 --- a/crawl4ai/browser/strategies/builtin.py +++ b/crawl4ai/browser/strategies/builtin.py @@ -115,24 +115,11 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): except Exception as e: if self.logger: self.logger.error(f"Failed to start built-in browser: {str(e)}", tag="BUILTIN") + + # NOTE: partially initialized resources may need to be cleaned up here before re-raising raise - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. - - Inherits behavior from CDPBrowserStrategy for page management.
- - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - # For built-in browsers, we use the same page management as CDP strategy - return await super().get_page(crawlerRunConfig) - - @classmethod - def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: + def _get_builtin_browser_info(self, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: """Get information about the built-in browser for a specific debugging port. Args: @@ -157,15 +144,14 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): browser_info = browser_info_dict["port_map"][port_str] # Check if the browser is still running - pids = browser_info.get('pid') - if type(pids) == str and len(pids.split("\n")) > 1: - pids = [int(pid) for pid in pids.split("\n") if pid.isdigit()] - elif type(pids) == str and pids.isdigit(): - pids = [int(pids)] - elif type(pids) == int: + pids = browser_info.get('pid', '') + if isinstance(pids, str): + pids = [int(pid) for pid in pids.split() if pid.isdigit()] + elif isinstance(pids, int): pids = [pids] else: pids = [] + # Check if any of the PIDs are running if not pids: if logger: @@ -205,7 +191,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): Returns: dict: Browser information or None if no running browser is configured """ - return self.get_builtin_browser_info( + return self._get_builtin_browser_info( debugging_port=self.config.debugging_port, config_file=self.builtin_config_file, logger=self.logger @@ -226,7 +212,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): str: CDP URL for the browser, or None if launch failed """ # Check if there's an existing browser still running - browser_info = self.get_builtin_browser_info( + browser_info = self._get_builtin_browser_info( debugging_port=debugging_port, config_file=self.builtin_config_file, logger=self.logger @@ -238,6 +224,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): # Create a user data directory for the built-in browser user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") + # Raise error if user data dir is already engaged if self._check_user_dir_is_engaged(user_data_dir): raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.") @@ -246,15 +233,19 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): os.makedirs(user_data_dir, exist_ok=True) # Prepare browser launch arguments + browser_args = super()._build_browser_args() browser_path = await get_browser_executable(browser_type) + base_args = [browser_path] + if browser_type == "chromium": args = [ - browser_path, f"--remote-debugging-port={debugging_port}", f"--user-data-dir={user_data_dir}", ] - if headless: - args.append("--headless=new") + # if headless: + # args.append("--headless=new") + elif browser_type == "firefox": args = [ - browser_path, "--remote-debugging-port", str(debugging_port), "--profile", user_data_dir, ] @@ -270,6 +261,8 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN") return None + args = base_args + browser_args + args + try: # Check if the port is already in use @@ -333,11 +326,12 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): # Check if it already uses port mapping if isinstance(existing_data, dict) and "port_map" in existing_data: port_map = existing_data["port_map"] - # Convert legacy format to port mapping - elif isinstance(existing_data, dict) and
"debugging_port" in existing_data: - old_port = str(existing_data.get("debugging_port")) - if self._is_browser_running(existing_data.get("pid")): - port_map[old_port] = existing_data + + # # Convert legacy format to port mapping + # elif isinstance(existing_data, dict) and "debugging_port" in existing_data: + # old_port = str(existing_data.get("debugging_port")) + # if self._is_browser_running(existing_data.get("pid")): + # port_map[old_port] = existing_data except Exception as e: if self.logger: self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN") @@ -413,15 +407,19 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): # Update config file to remove this browser with open(self.builtin_config_file, 'r') as f: browser_info_dict = json.load(f) + # Remove this port from the dictionary port_str = str(self.config.debugging_port) if port_str in browser_info_dict.get("port_map", {}): del browser_info_dict["port_map"][port_str] + with open(self.builtin_config_file, 'w') as f: json.dump(browser_info_dict, f, indent=2) + # Remove user data directory if it exists if os.path.exists(self.builtin_browser_dir): shutil.rmtree(self.builtin_browser_dir) + # Clear the browser info cache self.browser = None self.temp_dir = None @@ -460,14 +458,11 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): async def close(self): """Close the built-in browser and clean up resources.""" - # Store the shutting_down state - was_shutting_down = getattr(self, 'shutting_down', False) - # Call parent class close method await super().close() # Clean up built-in browser if we created it and were in shutdown mode - if was_shutting_down: + if self.shutting_down: await self.kill_builtin_browser() if self.logger: self.logger.debug("Killed built-in browser during shutdown", tag="BUILTIN") \ No newline at end of file diff --git a/crawl4ai/browser/strategies/cdp.py b/crawl4ai/browser/strategies/cdp.py index 26eba00e..e5982065 100644 --- a/crawl4ai/browser/strategies/cdp.py +++ b/crawl4ai/browser/strategies/cdp.py @@ -68,9 +68,11 @@ class CDPBrowserStrategy(BaseBrowserStrategy): if self.logger: self.logger.debug(f"Connected to CDP browser at {cdp_url}", tag="CDP") + except Exception as e: if self.logger: self.logger.error(f"Failed to connect to CDP browser: {str(e)}", tag="CDP") + # Clean up any resources before re-raising await self._cleanup_process() raise @@ -95,7 +97,32 @@ class CDPBrowserStrategy(BaseBrowserStrategy): user_data_dir = self.config.user_data_dir # Get browser args based on OS and browser type - args = await self._get_browser_args(user_data_dir) + # args = await self._get_browser_args(user_data_dir) + browser_args = super()._build_browser_args() + browser_path = await get_browser_executable(self.config.browser_type) + base_args = [browser_path] + + if self.config.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.config.debugging_port}", + f"--user-data-dir={user_data_dir}", + ] + # if self.config.headless: + # args.append("--headless=new") + + elif self.config.browser_type == "firefox": + args = [ + "--remote-debugging-port", + str(self.config.debugging_port), + "--profile", + user_data_dir, + ] + if self.config.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") + + args = base_args + browser_args + args # Start browser process try: @@ -136,40 +163,6 @@ class CDPBrowserStrategy(BaseBrowserStrategy): except Exception as e: await self._cleanup_process() raise Exception(f"Failed to start 
browser: {e}") - - async def _get_browser_args(self, user_data_dir: str) -> List[str]: - """Returns browser-specific command line arguments. - - Args: - user_data_dir: Path to user data directory - - Returns: - List of command-line arguments for the browser - """ - browser_args = super()._build_browser_args() - browser_path = await get_browser_executable(self.config.browser_type) - base_args = [browser_path] - - if self.config.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.config.debugging_port}", - f"--user-data-dir={user_data_dir}", - ] - if self.config.headless: - args.append("--headless=new") - elif self.config.browser_type == "firefox": - args = [ - "--remote-debugging-port", - str(self.config.debugging_port), - "--profile", - user_data_dir, - ] - if self.config.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") - - return base_args + browser_args + args async def _cleanup_process(self): """Cleanup browser process and temporary directory.""" @@ -204,15 +197,40 @@ class CDPBrowserStrategy(BaseBrowserStrategy): if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) + self.temp_dir = None + if self.logger: + self.logger.debug("Removed temporary directory", tag="CDP") except Exception as e: if self.logger: self.logger.error( message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)}, + tag="CDP", + params={"error": str(e)} ) + + self.browser_process = None - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + # For CDP, we typically use the shared default_context + context = self.default_context + pages = context.pages + + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + await self.setup_context(context, crawlerRunConfig) + + # Check if there's already a page with the target URL + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + + # If not found, create a new page + if not page: + page = await context.new_page() + + return page, context + + async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page for the given configuration. 
Args: @@ -221,15 +239,8 @@ class CDPBrowserStrategy(BaseBrowserStrategy): Returns: Tuple of (Page, BrowserContext) """ - # Clean up expired sessions using base class method - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context + # Call parent method to ensure browser is started + await super().get_page(crawlerRunConfig) # For CDP, we typically use the shared default_context context = self.default_context @@ -266,24 +277,5 @@ class CDPBrowserStrategy(BaseBrowserStrategy): await super().close() # Additional CDP-specific cleanup - if self.browser_process: - await asyncio.sleep(0.5) - await self._cleanup_process() - self.browser_process = None - if self.logger: - self.logger.debug("Cleaned up CDP browser process", tag="CDP") - - # Clean up temporary directory - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - self.temp_dir = None - if self.logger: - self.logger.debug("Removed temporary directory", tag="CDP") - except Exception as e: - if self.logger: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="CDP", - params={"error": str(e)} - ) \ No newline at end of file + await asyncio.sleep(0.5) + await self._cleanup_process() diff --git a/crawl4ai/browser/strategies/docker_strategy.py b/crawl4ai/browser/strategies/docker_strategy.py index a71d48e9..5390fc8a 100644 --- a/crawl4ai/browser/strategies/docker_strategy.py +++ b/crawl4ai/browser/strategies/docker_strategy.py @@ -15,7 +15,7 @@ from ..models import DockerConfig from ..docker_registry import DockerRegistry from ..docker_utils import DockerUtils from .builtin import CDPBrowserStrategy - +from .base import BaseBrowserStrategy class DockerBrowserStrategy(CDPBrowserStrategy): """Docker-based browser strategy. @@ -79,9 +79,7 @@ class DockerBrowserStrategy(CDPBrowserStrategy): self: For method chaining """ # Initialize Playwright - from ..utils import get_playwright - - self.playwright = await get_playwright() + await BaseBrowserStrategy.start(self) if self.logger: self.logger.info( @@ -172,121 +170,6 @@ class DockerBrowserStrategy(CDPBrowserStrategy): # Use the utility method to generate the hash return self.docker_utils.generate_config_hash(config_dict) - async def _get_or_create_cdp_url1(self) -> str: - """Get CDP URL by either creating a new container or using an existing one. 
- - Returns: - CDP URL for connecting to the browser - - Raises: - Exception: If container creation or browser launch fails - """ - # If CDP URL is explicitly provided, use it - if self.config.cdp_url: - return self.config.cdp_url - - # Ensure Docker image exists (will build if needed) - image_name = await self.docker_utils.ensure_docker_image_exists( - self.docker_config.image, self.docker_config.mode - ) - - # Generate config hash for container matching - config_hash = await self._generate_config_hash() - - # Look for existing container with matching config - container_id = self.registry.find_container_by_config( - config_hash, self.docker_utils - ) - - if container_id: - # Use existing container - self.container_id = container_id - host_port = self.registry.get_container_host_port(container_id) - if self.logger: - self.logger.info( - f"Using existing Docker container: {container_id[:12]}", - tag="DOCKER", - ) - else: - # Get a port for the new container - host_port = ( - self.docker_config.host_port - or self.registry.get_next_available_port(self.docker_utils) - ) - - # Prepare volumes list - volumes = list(self.docker_config.volumes) - - # Add user data directory if specified - if self.docker_config.user_data_dir: - # Ensure user data directory exists - os.makedirs(self.docker_config.user_data_dir, exist_ok=True) - volumes.append( - f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}" - ) - - # Update config user_data_dir to point to container path - self.config.user_data_dir = self.docker_config.container_user_data_dir - - # Create a new container - container_id = await self.docker_utils.create_container( - image_name=image_name, - host_port=host_port, - container_name=self.container_name, - volumes=volumes, - network=self.docker_config.network, - env_vars=self.docker_config.env_vars, - extra_args=self.docker_config.extra_args, - ) - - if not container_id: - raise Exception("Failed to create Docker container") - - self.container_id = container_id - - # Register the container - self.registry.register_container(container_id, host_port, config_hash) - - # Wait for container to be ready - await self.docker_utils.wait_for_container_ready(container_id) - - # Handle specific setup based on mode - if self.docker_config.mode == "launch": - # In launch mode, we need to start socat and Chrome - await self.docker_utils.start_socat_in_container(container_id) - - # Build browser arguments - browser_args = self._build_browser_args() - - # Launch Chrome - await self.docker_utils.launch_chrome_in_container( - container_id, browser_args - ) - - # Get PIDs for later cleanup - self.chrome_process_id = ( - await self.docker_utils.get_process_id_in_container( - container_id, "chrome" - ) - ) - self.socat_process_id = ( - await self.docker_utils.get_process_id_in_container( - container_id, "socat" - ) - ) - - # Wait for CDP to be ready - await self.docker_utils.wait_for_cdp_ready(host_port) - - if self.logger: - self.logger.success( - f"Docker container ready: {container_id[:12]} on port {host_port}", - tag="DOCKER", - ) - - # Return CDP URL - return f"http://localhost:{host_port}" - async def _get_or_create_cdp_url(self) -> str: """Get CDP URL by either creating a new container or using an existing one. 
@@ -465,8 +348,7 @@ class DockerBrowserStrategy(CDPBrowserStrategy): async def close(self): """Close the browser and clean up Docker container if needed.""" # Set flag to track if we were the ones initiating shutdown - initiated_shutdown = not getattr(self, "shutting_down", False) - + initiated_shutdown = not self.shutting_down # Storage persistence for Docker needs special handling # We need to store state before calling super().close() which will close the browser if ( diff --git a/crawl4ai/browser/strategies/playwright.py b/crawl4ai/browser/strategies/playwright.py index b24edf72..bea99753 100644 --- a/crawl4ai/browser/strategies/playwright.py +++ b/crawl4ai/browser/strategies/playwright.py @@ -80,8 +80,26 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): raise return self - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + + async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + # Check whether we already have a context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and set up a new context + context = await self.create_browser_context(crawlerRunConfig) + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context + page = await context.new_page() + + return page, context + + async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page for the given configuration. Args: @@ -90,15 +108,8 @@ Returns: Tuple of (Page, BrowserContext) """ - # Clean up expired sessions first - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context + # Call parent method to ensure browser is started + await super().get_page(crawlerRunConfig) # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) @@ -121,8 +132,3 @@ return page, context - async def close(self): - """Close the Playwright browser and clean up resources.""" - # The base implementation already handles everything needed for Playwright - # including storage persistence, sessions, contexts, browser and playwright - await super().close() \ No newline at end of file diff --git a/crawl4ai/models.py b/crawl4ai/models.py index f9551c1a..aad14a1d 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,7 @@ from pydantic import BaseModel, HttpUrl, PrivateAttr from typing import List, Dict, Optional, Callable, Awaitable, Union, Any +from typing import AsyncGenerator +from typing import Generic, TypeVar from enum import Enum from dataclasses import dataclass from .ssl_certificate import SSLCertificate @@ -34,34 +36,12 @@ class CrawlerTaskResult: def success(self) -> bool: return self.result.success - class CrawlStatus(Enum): QUEUED = "QUEUED" IN_PROGRESS = "IN_PROGRESS" COMPLETED = "COMPLETED" FAILED
= "FAILED" - -# @dataclass -# class CrawlStats: -# task_id: str -# url: str -# status: CrawlStatus -# start_time: Optional[datetime] = None -# end_time: Optional[datetime] = None -# memory_usage: float = 0.0 -# peak_memory: float = 0.0 -# error_message: str = "" - -# @property -# def duration(self) -> str: -# if not self.start_time: -# return "0:00" -# end = self.end_time or datetime.now() -# duration = end - self.start_time -# return str(timedelta(seconds=int(duration.total_seconds()))) - - @dataclass class CrawlStats: task_id: str @@ -95,7 +75,6 @@ class CrawlStats: duration = end - start return str(timedelta(seconds=int(duration.total_seconds()))) - class DisplayMode(Enum): DETAILED = "DETAILED" AGGREGATED = "AGGREGATED" @@ -112,12 +91,10 @@ class TokenUsage: completion_tokens_details: Optional[dict] = None prompt_tokens_details: Optional[dict] = None - class UrlModel(BaseModel): url: HttpUrl forced: bool = False - class MarkdownGenerationResult(BaseModel): raw_markdown: str markdown_with_citations: str @@ -284,6 +261,40 @@ class StringCompatibleMarkdown(str): def __getattr__(self, name): return getattr(self._markdown_result, name) +CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) + +class CrawlResultContainer(Generic[CrawlResultT]): + def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): + # Normalize to a list + if isinstance(results, list): + self._results = results + else: + self._results = [results] + + def __iter__(self): + return iter(self._results) + + def __getitem__(self, index): + return self._results[index] + + def __len__(self): + return len(self._results) + + def __getattr__(self, attr): + # Delegate attribute access to the first element. + if self._results: + return getattr(self._results[0], attr) + raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") + + def __repr__(self): + return f"{self.__class__.__name__}({self._results!r})" + +RunManyReturn = Union[ + CrawlResultContainer[CrawlResultT], + AsyncGenerator[CrawlResultT, None] +] + + # END of backward compatibility code for markdown/markdown_v2. # When removing this code in the future, make sure to: # 1. 
Replace the private attribute and property with a standard field @@ -304,7 +315,6 @@ class AsyncCrawlResponse(BaseModel): class Config: arbitrary_types_allowed = True - ############################### # Scraping Models ############################### diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py index cde5df23..2ec64a6b 100644 --- a/tests/browser/docker/test_docker_browser.py +++ b/tests/browser/docker/test_docker_browser.py @@ -530,7 +530,7 @@ async def test_docker_registry_reuse(): logger.info("First browser started successfully", tag="TEST") # Get container ID from the strategy - docker_strategy1 = manager1._strategy + docker_strategy1 = manager1.strategy container_id1 = docker_strategy1.container_id logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST") @@ -560,7 +560,7 @@ async def test_docker_registry_reuse(): logger.info("Second browser started successfully", tag="TEST") # Get container ID from the second strategy - docker_strategy2 = manager2._strategy + docker_strategy2 = manager2.strategy container_id2 = docker_strategy2.container_id logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST") diff --git a/tests/browser/test_builtin_browser.py b/tests/browser/test_builtin_browser.py index 0735e457..4797648c 100644 --- a/tests/browser/test_builtin_browser.py +++ b/tests/browser/test_builtin_browser.py @@ -56,13 +56,13 @@ async def test_builtin_browser_creation(): # Step 2: Check if we have a BuiltinBrowserStrategy print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}") - if isinstance(manager._strategy, BuiltinBrowserStrategy): + if isinstance(manager.strategy, BuiltinBrowserStrategy): print( - f"{SUCCESS}Correct strategy type: {manager._strategy.__class__.__name__}{RESET}" + f"{SUCCESS}Correct strategy type: {manager.strategy.__class__.__name__}{RESET}" ) else: print( - f"{ERROR}Wrong strategy type: {manager._strategy.__class__.__name__}{RESET}" + f"{ERROR}Wrong strategy type: {manager.strategy.__class__.__name__}{RESET}" ) return None @@ -77,7 +77,7 @@ async def test_builtin_browser_creation(): # Step 4: Get browser info from the strategy print(f"\n{INFO}4. Getting browser information{RESET}") - browser_info = manager._strategy.get_browser_info() + browser_info = manager.strategy.get_browser_info() if browser_info: print(f"{SUCCESS}Browser info retrieved:{RESET}") for key, value in browser_info.items(): @@ -149,7 +149,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 1: Get browser status print(f"\n{INFO}1. Getting browser status{RESET}") try: - status = await manager._strategy.get_builtin_browser_status() + status = await manager.strategy.get_builtin_browser_status() print(f"{SUCCESS}Browser status:{RESET}") print(f" Running: {status['running']}") print(f" CDP URL: {status['cdp_url']}") @@ -160,7 +160,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 2: Test killing the browser print(f"\n{INFO}2. Testing killing the browser{RESET}") try: - result = await manager._strategy.kill_builtin_browser() + result = await manager.strategy.kill_builtin_browser() if result: print(f"{SUCCESS}Browser killed successfully{RESET}") else: @@ -172,7 +172,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 3: Check status after kill print(f"\n{INFO}3. 
Checking status after kill{RESET}") try: - status = await manager._strategy.get_builtin_browser_status() + status = await manager.strategy.get_builtin_browser_status() if not status["running"]: print(f"{SUCCESS}Browser is correctly reported as not running{RESET}") else: @@ -184,7 +184,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 4: Launch a new browser print(f"\n{INFO}4. Launching a new browser{RESET}") try: - cdp_url = await manager._strategy.launch_builtin_browser( + cdp_url = await manager.strategy.launch_builtin_browser( browser_type="chromium", headless=True ) if cdp_url: @@ -223,8 +223,8 @@ async def test_multiple_managers(): print(f"{SUCCESS}Second manager started{RESET}") # Check if they got the same CDP URL - cdp_url1 = manager1._strategy.config.cdp_url - cdp_url2 = manager2._strategy.config.cdp_url + cdp_url1 = manager1.strategy.config.cdp_url + cdp_url2 = manager2.strategy.config.cdp_url if cdp_url1 == cdp_url2: print( @@ -316,7 +316,7 @@ async def test_edge_cases(): # Kill the browser directly print(f"{INFO}Killing the browser...{RESET}") - await manager._strategy.kill_builtin_browser() + await manager.strategy.kill_builtin_browser() print(f"{SUCCESS}Browser killed{RESET}") # Try to get a page (should fail or launch a new browser) @@ -350,7 +350,7 @@ async def cleanup_browsers(): try: # No need to start, just access the strategy directly - strategy = manager._strategy + strategy = manager.strategy if isinstance(strategy, BuiltinBrowserStrategy): result = await strategy.kill_builtin_browser() if result: @@ -420,7 +420,7 @@ async def test_performance_scaling(): user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), ) manager = BrowserManager(browser_config=browser_config, logger=logger) - manager._strategy.shutting_down = True + manager.strategy.shutting_down = True manager_configs.append((manager, i, port)) # Define async function to start a single manager @@ -614,7 +614,7 @@ async def test_performance_scaling_lab( num_browsers: int = 10, pages_per_browse user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), ) manager = BrowserManager(browser_config=browser_config, logger=logger) - manager._strategy.shutting_down = True + manager.strategy.shutting_down = True manager_configs.append((manager, i, port)) # Define async function to start a single manager