diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ef49dd3..fea79456 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,30 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +### [Feature] 2025-04-21 +- Implemented MCP protocol for machine-to-machine communication + - Added WebSocket and SSE transport for MCP server + - Exposed server endpoints via MCP protocol + - Created tests for MCP socket and SSE communication +- Enhanced Docker server with file handling and intelligent search + - Added PDF and screenshot endpoints with file saving capability + - Added JavaScript execution endpoint for page interaction + - Implemented advanced context search with BM25 and code chunking + - Added file path output support for generated assets +- Improved server endpoints and API surface + - Added intelligent context search with query filtering + - Added syntax-aware code function chunking + - Implemented efficient HTML processing pipeline + +### [Refactor] 2025-04-20 +- Replaced crawler_manager.py with simpler crawler_pool.py implementation +- Added global page semaphore for hard concurrency cap +- Implemented browser pool with idle cleanup +- Added playground UI for testing and stress testing +- Updated API handlers to use pooled crawlers +- Enhanced logging levels and symbols +- Added memory tests and stress test utilities + ### [Added] 2025-04-17 - Added content source selection feature for markdown generation - New `content_source` parameter allows choosing between `cleaned_html`, `raw_html`, and `fit_html` diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md new file mode 100644 index 00000000..f2551c01 --- /dev/null +++ b/deploy/docker/c4ai-code-context.md @@ -0,0 +1,11631 @@ +# Crawl4AI Code Context + +Generated on 2025-04-21 + +## File: 
crawl4ai/async_configs.py + +```py +import os +from .config import ( + DEFAULT_PROVIDER, + DEFAULT_PROVIDER_API_KEY, + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + PROVIDER_MODELS, + PROVIDER_MODELS_PREFIXES, + SCREENSHOT_HEIGHT_TRESHOLD, + PAGE_TIMEOUT, + IMAGE_SCORE_THRESHOLD, + SOCIAL_MEDIA_DOMAINS, +) + +from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator +from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy +from .chunking_strategy import ChunkingStrategy, RegexChunking + +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator +from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy +from .deep_crawling import DeepCrawlStrategy + +from .cache_context import CacheMode +from .proxy_strategy import ProxyRotationStrategy + +from typing import Union, List +import inspect +from typing import Any, Dict, Optional +from enum import Enum + +# from .proxy_strategy import ProxyConfig + + + +def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: + """ + Recursively convert an object to a serializable dictionary using {type, params} structure + for complex objects. 
+ """ + if obj is None: + return None + + # Handle basic types + if isinstance(obj, (str, int, float, bool)): + return obj + + # Handle Enum + if isinstance(obj, Enum): + return {"type": obj.__class__.__name__, "params": obj.value} + + # Handle datetime objects + if hasattr(obj, "isoformat"): + return obj.isoformat() + + # Handle lists, tuples, and sets, and basically any iterable + if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__') and not isinstance(obj, dict): + return [to_serializable_dict(item) for item in obj] + + # Handle frozensets, which are not iterable + if isinstance(obj, frozenset): + return [to_serializable_dict(item) for item in list(obj)] + + # Handle dictionaries - preserve them as-is + if isinstance(obj, dict): + return { + "type": "dict", # Mark as plain dictionary + "value": {str(k): to_serializable_dict(v) for k, v in obj.items()}, + } + + _type = obj.__class__.__name__ + + # Handle class instances + if hasattr(obj, "__class__"): + # Get constructor signature + sig = inspect.signature(obj.__class__.__init__) + params = sig.parameters + + # Get current values + current_values = {} + for name, param in params.items(): + if name == "self": + continue + + value = getattr(obj, name, param.default) + + # Only include if different from default, considering empty values + if not (is_empty_value(value) and is_empty_value(param.default)): + if value != param.default and not ignore_default_value: + current_values[name] = to_serializable_dict(value) + + if hasattr(obj, '__slots__'): + for slot in obj.__slots__: + if slot.startswith('_'): # Handle private slots + attr_name = slot[1:] # Remove leading '_' + value = getattr(obj, slot, None) + if value is not None: + current_values[attr_name] = to_serializable_dict(value) + + + + return { + "type": obj.__class__.__name__, + "params": current_values + } + + return str(obj) + + +def from_serializable_dict(data: Any) -> Any: + """ + Recursively convert a serializable dictionary back to an object 
instance. + """ + if data is None: + return None + + # Handle basic types + if isinstance(data, (str, int, float, bool)): + return data + + # Handle typed data + if isinstance(data, dict) and "type" in data: + # Handle plain dictionaries + if data["type"] == "dict" and "value" in data: + return {k: from_serializable_dict(v) for k, v in data["value"].items()} + + # Import from crawl4ai for class instances + import crawl4ai + + if hasattr(crawl4ai, data["type"]): + cls = getattr(crawl4ai, data["type"]) + + # Handle Enum + if issubclass(cls, Enum): + return cls(data["params"]) + + if "params" in data: + # Handle class instances + constructor_args = { + k: from_serializable_dict(v) for k, v in data["params"].items() + } + return cls(**constructor_args) + + # Handle lists + if isinstance(data, list): + return [from_serializable_dict(item) for item in data] + + # Handle raw dictionaries (legacy support) + if isinstance(data, dict): + return {k: from_serializable_dict(v) for k, v in data.items()} + + return data + + +def is_empty_value(value: Any) -> bool: + """Check if a value is effectively empty/null.""" + if value is None: + return True + if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0: + return True + return False + +class ProxyConfig: + def __init__( + self, + server: str, + username: Optional[str] = None, + password: Optional[str] = None, + ip: Optional[str] = None, + ): + """Configuration class for a single proxy. 
+ + Args: + server: Proxy server URL (e.g., "http://127.0.0.1:8080") + username: Optional username for proxy authentication + password: Optional password for proxy authentication + ip: Optional IP address for verification purposes + """ + self.server = server + self.username = username + self.password = password + + # Extract IP from server if not explicitly provided + self.ip = ip or self._extract_ip_from_server() + + def _extract_ip_from_server(self) -> Optional[str]: + """Extract IP address from server URL.""" + try: + # Simple extraction assuming http://ip:port format + if "://" in self.server: + parts = self.server.split("://")[1].split(":") + return parts[0] + else: + parts = self.server.split(":") + return parts[0] + except Exception: + return None + + @staticmethod + def from_string(proxy_str: str) -> "ProxyConfig": + """Create a ProxyConfig from a string in the format 'ip:port:username:password'.""" + parts = proxy_str.split(":") + if len(parts) == 4: # ip:port:username:password + ip, port, username, password = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + username=username, + password=password, + ip=ip + ) + elif len(parts) == 2: # ip:port only + ip, port = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + ip=ip + ) + else: + raise ValueError(f"Invalid proxy string format: {proxy_str}") + + @staticmethod + def from_dict(proxy_dict: Dict) -> "ProxyConfig": + """Create a ProxyConfig from a dictionary.""" + return ProxyConfig( + server=proxy_dict.get("server"), + username=proxy_dict.get("username"), + password=proxy_dict.get("password"), + ip=proxy_dict.get("ip") + ) + + @staticmethod + def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: + """Load proxies from environment variable. 
+ + Args: + env_var: Name of environment variable containing comma-separated proxy strings + + Returns: + List of ProxyConfig objects + """ + proxies = [] + try: + proxy_list = os.getenv(env_var, "").split(",") + for proxy in proxy_list: + if not proxy: + continue + proxies.append(ProxyConfig.from_string(proxy)) + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + + def to_dict(self) -> Dict: + """Convert to dictionary representation.""" + return { + "server": self.server, + "username": self.username, + "password": self.password, + "ip": self.ip + } + + def clone(self, **kwargs) -> "ProxyConfig": + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + ProxyConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return ProxyConfig.from_dict(config_dict) + + + +class BrowserConfig: + """ + Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. + + This class centralizes all parameters that affect browser and context creation. Instead of passing + scattered keyword arguments, users can instantiate and modify this configuration object. The crawler + code will then reference these settings to initialize the browser in a consistent, documented manner. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. 
+ browser_mode (str): Determines how the browser should be initialized: + "builtin" - use the builtin CDP browser running in background + "dedicated" - create a new dedicated browser instance each time + "cdp" - use explicit CDP settings provided in cdp_url + "docker" - run browser in Docker container with isolation + Default: "dedicated" + use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing + advanced manipulation. Default: False. + cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/". + debugging_port (int): Port for the browser debugging protocol. Default: 9222. + use_persistent_context (bool): Use a persistent browser context (like a persistent profile). + Automatically sets use_managed_browser=True. Default: False. + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type + is "chromium". Default: "chromium". + channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type + is "chromium". Default: "chromium". + proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used. + Default: None. + proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + If None, no additional proxy config. Default: None. + viewport_width (int): Default viewport width for pages. Default: 1080. + viewport_height (int): Default viewport height for pages. Default: 600. + viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height. + Default: None. + verbose (bool): Enable verbose logging. + Default: True. + accept_downloads (bool): Whether to allow file downloads. 
If True, requires a downloads_path. + Default: False. + downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True, + a default path will be created. Default: None. + storage_state (str or dict or None): An in-memory storage state (cookies, localStorage). + Default: None. + ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True. + java_script_enabled (bool): Enable JavaScript execution in pages. Default: True. + cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like + {"name": "...", "value": "...", "url": "..."}. + Default: []. + headers (dict): Extra HTTP headers to apply to all requests in this context. + Default: {}. + user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36". + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided + user_agent as-is. Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. + text_mode (bool): If True, disables images and other rich content for potentially faster load times. + Default: False. + light_mode (bool): Disables certain background features for performance gains. Default: False. + extra_args (list): Additional command-line arguments passed to the browser. + Default: []. 
+ """ + + def __init__( + self, + browser_type: str = "chromium", + headless: bool = True, + browser_mode: str = "dedicated", + use_managed_browser: bool = False, + cdp_url: str = None, + use_persistent_context: bool = False, + user_data_dir: str = None, + chrome_channel: str = "chromium", + channel: str = "chromium", + proxy: str = None, + proxy_config: Union[ProxyConfig, dict, None] = None, + viewport_width: int = 1080, + viewport_height: int = 600, + viewport: dict = None, + accept_downloads: bool = False, + downloads_path: str = None, + storage_state: Union[str, dict, None] = None, + ignore_https_errors: bool = True, + java_script_enabled: bool = True, + sleep_on_close: bool = False, + verbose: bool = True, + cookies: list = None, + headers: dict = None, + user_agent: str = ( + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " + # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + # "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36" + ), + user_agent_mode: str = "", + user_agent_generator_config: dict = {}, + text_mode: bool = False, + light_mode: bool = False, + extra_args: list = None, + debugging_port: int = 9222, + host: str = "localhost", + ): + self.browser_type = browser_type + self.headless = headless or True + self.browser_mode = browser_mode + self.use_managed_browser = use_managed_browser + self.cdp_url = cdp_url + self.use_persistent_context = use_persistent_context + self.user_data_dir = user_data_dir + self.chrome_channel = chrome_channel or self.browser_type or "chromium" + self.channel = channel or self.browser_type or "chromium" + if self.browser_type in ["firefox", "webkit"]: + self.channel = "" + self.chrome_channel = "" + self.proxy = proxy + self.proxy_config = proxy_config + + + self.viewport_width = viewport_width + self.viewport_height = viewport_height + self.viewport = viewport + if 
self.viewport is not None: + self.viewport_width = self.viewport.get("width", 1080) + self.viewport_height = self.viewport.get("height", 600) + self.accept_downloads = accept_downloads + self.downloads_path = downloads_path + self.storage_state = storage_state + self.ignore_https_errors = ignore_https_errors + self.java_script_enabled = java_script_enabled + self.cookies = cookies if cookies is not None else [] + self.headers = headers if headers is not None else {} + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config = user_agent_generator_config + self.text_mode = text_mode + self.light_mode = light_mode + self.extra_args = extra_args if extra_args is not None else [] + self.sleep_on_close = sleep_on_close + self.verbose = verbose + self.debugging_port = debugging_port + self.host = host + + fa_user_agenr_generator = ValidUAGenerator() + if self.user_agent_mode == "random": + self.user_agent = fa_user_agenr_generator.generate( + **(self.user_agent_generator_config or {}) + ) + else: + pass + + self.browser_hint = UAGen.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) + + # Set appropriate browser management flags based on browser_mode + if self.browser_mode == "builtin": + # Builtin mode uses managed browser connecting to builtin CDP endpoint + self.use_managed_browser = True + # cdp_url will be set later by browser_manager + elif self.browser_mode == "docker": + # Docker mode uses managed browser with CDP to connect to browser in container + self.use_managed_browser = True + # cdp_url will be set later by docker browser strategy + elif self.browser_mode == "custom" and self.cdp_url: + # Custom mode with explicit CDP URL + self.use_managed_browser = True + elif self.browser_mode == "dedicated": + # Dedicated mode uses a new browser instance each time + pass + + # If persistent context is requested, ensure managed browser is enabled + if 
self.use_persistent_context: + self.use_managed_browser = True + + @staticmethod + def from_kwargs(kwargs: dict) -> "BrowserConfig": + return BrowserConfig( + browser_type=kwargs.get("browser_type", "chromium"), + headless=kwargs.get("headless", True), + browser_mode=kwargs.get("browser_mode", "dedicated"), + use_managed_browser=kwargs.get("use_managed_browser", False), + cdp_url=kwargs.get("cdp_url"), + use_persistent_context=kwargs.get("use_persistent_context", False), + user_data_dir=kwargs.get("user_data_dir"), + chrome_channel=kwargs.get("chrome_channel", "chromium"), + channel=kwargs.get("channel", "chromium"), + proxy=kwargs.get("proxy"), + proxy_config=kwargs.get("proxy_config", None), + viewport_width=kwargs.get("viewport_width", 1080), + viewport_height=kwargs.get("viewport_height", 600), + accept_downloads=kwargs.get("accept_downloads", False), + downloads_path=kwargs.get("downloads_path"), + storage_state=kwargs.get("storage_state"), + ignore_https_errors=kwargs.get("ignore_https_errors", True), + java_script_enabled=kwargs.get("java_script_enabled", True), + cookies=kwargs.get("cookies", []), + headers=kwargs.get("headers", {}), + user_agent=kwargs.get( + "user_agent", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", + ), + user_agent_mode=kwargs.get("user_agent_mode"), + user_agent_generator_config=kwargs.get("user_agent_generator_config"), + text_mode=kwargs.get("text_mode", False), + light_mode=kwargs.get("light_mode", False), + extra_args=kwargs.get("extra_args", []), + debugging_port=kwargs.get("debugging_port", 9222), + host=kwargs.get("host", "localhost"), + ) + + def to_dict(self): + result = { + "browser_type": self.browser_type, + "headless": self.headless, + "browser_mode": self.browser_mode, + "use_managed_browser": self.use_managed_browser, + "cdp_url": self.cdp_url, + "use_persistent_context": self.use_persistent_context, + "user_data_dir": self.user_data_dir, 
+ "chrome_channel": self.chrome_channel, + "channel": self.channel, + "proxy": self.proxy, + "proxy_config": self.proxy_config, + "viewport_width": self.viewport_width, + "viewport_height": self.viewport_height, + "accept_downloads": self.accept_downloads, + "downloads_path": self.downloads_path, + "storage_state": self.storage_state, + "ignore_https_errors": self.ignore_https_errors, + "java_script_enabled": self.java_script_enabled, + "cookies": self.cookies, + "headers": self.headers, + "user_agent": self.user_agent, + "user_agent_mode": self.user_agent_mode, + "user_agent_generator_config": self.user_agent_generator_config, + "text_mode": self.text_mode, + "light_mode": self.light_mode, + "extra_args": self.extra_args, + "sleep_on_close": self.sleep_on_close, + "verbose": self.verbose, + "debugging_port": self.debugging_port, + "host": self.host, + } + + + return result + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + BrowserConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return BrowserConfig.from_kwargs(config_dict) + + # Create a funciton returns dict of the object + def dump(self) -> dict: + # Serialize the object to a dictionary + return to_serializable_dict(self) + + @staticmethod + def load(data: dict) -> "BrowserConfig": + # Deserialize the object from a dictionary + config = from_serializable_dict(data) + if isinstance(config, BrowserConfig): + return config + return BrowserConfig.from_kwargs(config) + + +class HTTPCrawlerConfig: + """HTTP-specific crawler configuration""" + + method: str = "GET" + headers: Optional[Dict[str, str]] = None + data: Optional[Dict[str, Any]] = None + json: Optional[Dict[str, Any]] = None + follow_redirects: bool = True + verify_ssl: bool = True + + def __init__( + self, + method: str = "GET", + headers: Optional[Dict[str, 
str]] = None, + data: Optional[Dict[str, Any]] = None, + json: Optional[Dict[str, Any]] = None, + follow_redirects: bool = True, + verify_ssl: bool = True, + ): + self.method = method + self.headers = headers + self.data = data + self.json = json + self.follow_redirects = follow_redirects + self.verify_ssl = verify_ssl + + @staticmethod + def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig": + return HTTPCrawlerConfig( + method=kwargs.get("method", "GET"), + headers=kwargs.get("headers"), + data=kwargs.get("data"), + json=kwargs.get("json"), + follow_redirects=kwargs.get("follow_redirects", True), + verify_ssl=kwargs.get("verify_ssl", True), + ) + + def to_dict(self): + return { + "method": self.method, + "headers": self.headers, + "data": self.data, + "json": self.json, + "follow_redirects": self.follow_redirects, + "verify_ssl": self.verify_ssl, + } + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + HTTPCrawlerConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return HTTPCrawlerConfig.from_kwargs(config_dict) + + def dump(self) -> dict: + return to_serializable_dict(self) + + @staticmethod + def load(data: dict) -> "HTTPCrawlerConfig": + config = from_serializable_dict(data) + if isinstance(config, HTTPCrawlerConfig): + return config + return HTTPCrawlerConfig.from_kwargs(config) + +class CrawlerRunConfig(): + _UNWANTED_PROPS = { + 'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED', + 'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS', + 'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY', + 'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY', + } + + """ + Configuration class for controlling how the crawler runs each crawl operation. 
+ This includes parameters for content extraction, page manipulation, waiting conditions, + caching, and other runtime behaviors. + + This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods. + By using this class, you have a single place to understand and adjust the crawling options. + + Attributes: + # Deep Crawl Parameters + deep_crawl_strategy (DeepCrawlStrategy or None): Strategy to use for deep crawling. + + # Content Processing Parameters + word_count_threshold (int): Minimum word count threshold before processing content. + Default: MIN_WORD_THRESHOLD (typically 200). + extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages. + Default: None (NoExtractionStrategy is used if None). + chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction. + Default: RegexChunking(). + markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown. + Default: None. + only_text (bool): If True, attempt to extract text-only content where applicable. + Default: False. + css_selector (str or None): CSS selector to extract a specific portion of the page. + Default: None. + + target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation + and structured data extraction. When you set this, only the contents + of these elements are processed for extraction and Markdown generation. + If you do not set any value, the entire page is processed. + The difference between this and css_selector is that css_selector will shrink + the initial raw HTML to the selected element, while target_elements will only affect + the extraction and Markdown generation. + Default: None + excluded_tags (list of str or None): List of HTML tags to exclude from processing. + Default: None. + excluded_selector (str or None): CSS selector to exclude from processing. + Default: None.
+ keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes. + Default: False. + keep_attrs (list of str): List of HTML attributes to keep during processing. + Default: []. + remove_forms (bool): If True, remove all `