def to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Any:
    """Recursively convert an object into a JSON-friendly structure.

    Complex objects are encoded as ``{"type": <class name>, "params": {...}}``
    and plain dictionaries as ``{"type": "dict", "value": {...}}`` so that
    ``from_serializable_dict`` can tell them apart.

    Args:
        obj: The value to convert.
        ignore_default_value: When True, no constructor parameter of a class
            instance is emitted (only values held in private ``__slots__``
            entries are). The flag is not forwarded to nested values.

    Returns:
        ``None``, a primitive, a list, or a ``{"type": ...}`` dictionary.
        If the constructor signature of ``obj`` cannot be inspected,
        ``str(obj)`` is returned.
    """
    if obj is None:
        return None

    # Primitives are already serializable.
    if isinstance(obj, (str, int, float, bool)):
        return obj

    if isinstance(obj, Enum):
        return {"type": obj.__class__.__name__, "params": obj.value}

    # datetime-like objects expose isoformat().
    if hasattr(obj, "isoformat"):
        return obj.isoformat()

    # Plain dictionaries are tagged so they are not mistaken for an encoded
    # class instance when decoding.
    if isinstance(obj, dict):
        return {
            "type": "dict",
            "value": {str(k): to_serializable_dict(v) for k, v in obj.items()},
        }

    # Any other iterable (list, tuple, set, frozenset, generator, ...)
    # becomes a list.
    if hasattr(obj, "__iter__"):
        return [to_serializable_dict(item) for item in obj]

    # Everything else is treated as a class instance described by the
    # parameters of its constructor.
    try:
        params = inspect.signature(obj.__class__.__init__).parameters
    except (TypeError, ValueError):
        # Some extension types expose no introspectable signature.
        return str(obj)

    current_values = {}
    for name, param in params.items():
        if name == "self":
            continue
        value = getattr(obj, name, param.default)
        # Two "empty" values (None, "", [], {}, ...) count as equal.
        if is_empty_value(value) and is_empty_value(param.default):
            continue
        if value != param.default and not ignore_default_value:
            current_values[name] = to_serializable_dict(value)

    if hasattr(obj, "__slots__"):
        for slot in obj.__slots__:
            if slot.startswith("_"):
                # Private slot "_foo" is exported under the name "foo".
                value = getattr(obj, slot, None)
                if value is not None:
                    current_values[slot[1:]] = to_serializable_dict(value)

    return {"type": obj.__class__.__name__, "params": current_values}


def from_serializable_dict(data: Any) -> Any:
    """Recursively rebuild objects from ``to_serializable_dict`` output.

    NOTE: a ``{"type": name, "params": {...}}`` entry instantiates the class
    called ``name`` exported by the ``crawl4ai`` package with the given
    parameters, so this function should not be fed untrusted input.

    Args:
        data: A primitive, list, or dictionary produced by
            ``to_serializable_dict`` (raw dictionaries are also accepted).

    Returns:
        The reconstructed value. Dictionaries that do not describe a known
        class are returned as plain dictionaries with their values decoded.
    """
    if data is None:
        return None

    if isinstance(data, (str, int, float, bool)):
        return data

    if isinstance(data, list):
        return [from_serializable_dict(item) for item in data]

    if not isinstance(data, dict):
        return data

    if "type" in data:
        type_name = data["type"]

        # Tagged plain dictionary.
        if type_name == "dict" and "value" in data:
            return {k: from_serializable_dict(v) for k, v in data["value"].items()}

        # Only a string can name a class; a raw dictionary may legitimately
        # carry any other value under "type".
        if isinstance(type_name, str) and "params" in data:
            import crawl4ai

            cls = getattr(crawl4ai, type_name, None)
            # Guard against names that resolve to modules, functions, etc.
            if inspect.isclass(cls):
                params = data["params"]
                if issubclass(cls, Enum):
                    return cls(params)
                if isinstance(params, dict):
                    constructor_args = {
                        k: from_serializable_dict(v) for k, v in params.items()
                    }
                    return cls(**constructor_args)

    # Raw dictionary (legacy support).
    return {k: from_serializable_dict(v) for k, v in data.items()}


def is_empty_value(value: Any) -> bool:
    """Return True for ``None`` and for empty list/tuple/set/dict/str values."""
    if value is None:
        return True
    return isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0
class ProxyConfig:
    """Settings for a single proxy: server URL, optional credentials and IP."""

    def __init__(
        self,
        server: str,
        username: Optional[str] = None,
        password: Optional[str] = None,
        ip: Optional[str] = None,
    ):
        """Configuration class for a single proxy.

        Args:
            server: Proxy server URL (e.g., "http://127.0.0.1:8080")
            username: Optional username for proxy authentication
            password: Optional password for proxy authentication
            ip: Optional IP address for verification purposes
        """
        self.server = server
        self.username = username
        self.password = password

        # Extract IP from server if not explicitly provided
        self.ip = ip or self._extract_ip_from_server()

    def _extract_ip_from_server(self) -> Optional[str]:
        """Extract the host/IP part of ``self.server``.

        Handles ``host:port``, ``scheme://host:port`` and URLs carrying
        credentials (``scheme://user:pass@host:port``).

        Returns:
            The host portion, or None when ``self.server`` is not a string.
        """
        from urllib.parse import urlsplit

        server = self.server
        if not isinstance(server, str):
            return None

        # urlsplit only recognises a network location after "//".
        target = server if "://" in server else f"//{server}"
        try:
            host = urlsplit(target).hostname
        except ValueError:
            # e.g. malformed IPv6 brackets
            host = None
        if host:
            return host

        # Fall back to the simple "text before the first colon" rule.
        return server.split("://", 1)[-1].split(":", 1)[0]

    @staticmethod
    def from_string(proxy_str: str) -> "ProxyConfig":
        """Create a ProxyConfig from 'ip:port' or 'ip:port:username:password'.

        Raises:
            ValueError: If the string has neither two nor four ':'-separated parts.
        """
        parts = proxy_str.split(":")
        if len(parts) == 4:  # ip:port:username:password
            ip, port, username, password = parts
            return ProxyConfig(
                server=f"http://{ip}:{port}",
                username=username,
                password=password,
                ip=ip,
            )
        if len(parts) == 2:  # ip:port only
            ip, port = parts
            return ProxyConfig(server=f"http://{ip}:{port}", ip=ip)
        raise ValueError(f"Invalid proxy string format: {proxy_str}")

    @staticmethod
    def from_dict(proxy_dict: Dict) -> "ProxyConfig":
        """Create a ProxyConfig from a dictionary with the keys
        "server", "username", "password" and "ip" (missing keys become None)."""
        return ProxyConfig(
            server=proxy_dict.get("server"),
            username=proxy_dict.get("username"),
            password=proxy_dict.get("password"),
            ip=proxy_dict.get("ip"),
        )

    @staticmethod
    def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
        """Load proxies from environment variable.

        Entries are comma-separated; surrounding whitespace and empty entries
        are ignored. A malformed entry is reported and skipped so that it
        does not discard the valid entries that follow it.

        Args:
            env_var: Name of environment variable containing comma-separated proxy strings

        Returns:
            List of ProxyConfig objects
        """
        proxies = []
        for raw_entry in os.getenv(env_var, "").split(","):
            entry = raw_entry.strip()
            if not entry:
                continue
            try:
                proxies.append(ProxyConfig.from_string(entry))
            except ValueError as e:
                print(f"Error loading proxies from environment: {e}")
        return proxies

    def to_dict(self) -> Dict:
        """Convert to dictionary representation."""
        return {
            "server": self.server,
            "username": self.username,
            "password": self.password,
            "ip": self.ip,
        }

    def clone(self, **kwargs) -> "ProxyConfig":
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            ProxyConfig: A new instance with the specified updates
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return ProxyConfig.from_dict(config_dict)
class BrowserConfig:
    """
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.

    This class centralizes all parameters that affect browser and context creation. Instead of passing
    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
    code will then reference these settings to initialize the browser in a consistent, documented manner.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                            Default: "chromium".
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
                         Default: True.
        browser_mode (str): Determines how the browser should be initialized:
                            "builtin" - use the builtin CDP browser running in background
                            "dedicated" - create a new dedicated browser instance each time
                            "custom" - use explicit CDP settings provided in cdp_url
                            "docker" - run browser in Docker container with isolation
                            Default: "dedicated"
        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
                                    advanced manipulation. Forced to True for "builtin" and "docker" modes,
                                    for "custom" mode with a cdp_url, and when use_persistent_context is set.
                                    Default: False.
        cdp_url (str or None): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: None.
        debugging_port (int): Port for the browser debugging protocol. Default: 9222.
        host (str): Host name stored alongside debugging_port. Default: "localhost".
        use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
                                       Automatically sets use_managed_browser=True. Default: False.
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                     temporary directory may be used. Default: None.
        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
                              is "chromium"; reset to "" for "firefox" and "webkit". Default: "chromium".
        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
                       is "chromium"; reset to "" for "firefox" and "webkit". Default: "chromium".
        proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
                               Default: None.
        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                                    If None, no additional proxy config. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
        viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
                         Default: None.
        verbose (bool): Enable verbose logging. Default: True.
        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
                                 Default: False.
        downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
                                      a default path will be created. Default: None.
        storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
                                             Default: None.
        ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
        java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
        sleep_on_close (bool): Stored on the config as given; its effect is decided by the code that
                               closes the browser (not visible in this class). Default: False.
        cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
                        {"name": "...", "value": "...", "url": "..."}. Default: [].
        headers (dict): Extra HTTP headers to apply to all requests in this context. A "sec-ch-ua" entry
                        derived from the user agent is added when not already present. Default: {}.
        user_agent (str): Custom User-Agent string to use.
                          Default: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36".
        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If empty or None,
                                       use the provided user_agent as-is. Default: "".
        user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
                                                    Default: None (treated as {}).
        text_mode (bool): If True, disables images and other rich content for potentially faster load times.
                          Default: False.
        light_mode (bool): Disables certain background features for performance gains. Default: False.
        extra_args (list): Additional command-line arguments passed to the browser.
                           Default: [].
    """

    def __init__(
        self,
        browser_type: str = "chromium",
        headless: bool = True,
        browser_mode: str = "dedicated",
        use_managed_browser: bool = False,
        cdp_url: str = None,
        use_persistent_context: bool = False,
        user_data_dir: str = None,
        chrome_channel: str = "chromium",
        channel: str = "chromium",
        proxy: str = None,
        proxy_config: Union[ProxyConfig, dict, None] = None,
        viewport_width: int = 1080,
        viewport_height: int = 600,
        viewport: dict = None,
        accept_downloads: bool = False,
        downloads_path: str = None,
        storage_state: Union[str, dict, None] = None,
        ignore_https_errors: bool = True,
        java_script_enabled: bool = True,
        sleep_on_close: bool = False,
        verbose: bool = True,
        cookies: list = None,
        headers: dict = None,
        user_agent: str = (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
        ),
        user_agent_mode: str = "",
        user_agent_generator_config: dict = None,
        text_mode: bool = False,
        light_mode: bool = False,
        extra_args: list = None,
        debugging_port: int = 9222,
        host: str = "localhost",
    ):
        self.browser_type = browser_type
        # Keep the caller's choice: `headless or True` would always be True.
        self.headless = headless
        self.browser_mode = browser_mode
        self.use_managed_browser = use_managed_browser
        self.cdp_url = cdp_url
        self.use_persistent_context = use_persistent_context
        self.user_data_dir = user_data_dir

        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
        self.channel = channel or self.browser_type or "chromium"
        if self.browser_type in ["firefox", "webkit"]:
            # Channels only apply to chromium.
            self.channel = ""
            self.chrome_channel = ""

        self.proxy = proxy
        self.proxy_config = proxy_config

        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.viewport = viewport
        if self.viewport is not None:
            # An explicit viewport dict wins over the individual dimensions.
            self.viewport_width = self.viewport.get("width", 1080)
            self.viewport_height = self.viewport.get("height", 600)

        self.accept_downloads = accept_downloads
        self.downloads_path = downloads_path
        self.storage_state = storage_state
        self.ignore_https_errors = ignore_https_errors
        self.java_script_enabled = java_script_enabled
        self.cookies = cookies if cookies is not None else []
        # Copy: "sec-ch-ua" is injected below and must not leak into the
        # caller's dictionary.
        self.headers = dict(headers) if headers is not None else {}
        self.user_agent = user_agent
        self.user_agent_mode = user_agent_mode
        # A fresh dict per instance instead of a shared mutable default.
        self.user_agent_generator_config = (
            user_agent_generator_config
            if user_agent_generator_config is not None
            else {}
        )
        self.text_mode = text_mode
        self.light_mode = light_mode
        self.extra_args = extra_args if extra_args is not None else []
        self.sleep_on_close = sleep_on_close
        self.verbose = verbose
        self.debugging_port = debugging_port
        self.host = host

        if self.user_agent_mode == "random":
            # The generator is only needed when a random agent is requested.
            self.user_agent = ValidUAGenerator().generate(
                **(self.user_agent_generator_config or {})
            )

        self.browser_hint = UAGen.generate_client_hints(self.user_agent)
        self.headers.setdefault("sec-ch-ua", self.browser_hint)

        # Set appropriate browser management flags based on browser_mode
        if self.browser_mode in ("builtin", "docker"):
            # Both modes connect to a managed browser over CDP; cdp_url is
            # expected to be filled in later by the respective strategy.
            self.use_managed_browser = True
        elif self.browser_mode == "custom" and self.cdp_url:
            # Custom mode with explicit CDP URL
            self.use_managed_browser = True
        # "dedicated" (and any other value) leaves the flag untouched.

        # If persistent context is requested, ensure managed browser is enabled
        if self.use_persistent_context:
            self.use_managed_browser = True

    @staticmethod
    def from_kwargs(kwargs: dict) -> "BrowserConfig":
        """Build a BrowserConfig from a dictionary of options.

        Unknown keys are ignored; missing keys fall back to the defaults
        listed below.
        """
        return BrowserConfig(
            browser_type=kwargs.get("browser_type", "chromium"),
            headless=kwargs.get("headless", True),
            browser_mode=kwargs.get("browser_mode", "dedicated"),
            use_managed_browser=kwargs.get("use_managed_browser", False),
            cdp_url=kwargs.get("cdp_url"),
            use_persistent_context=kwargs.get("use_persistent_context", False),
            user_data_dir=kwargs.get("user_data_dir"),
            chrome_channel=kwargs.get("chrome_channel", "chromium"),
            channel=kwargs.get("channel", "chromium"),
            proxy=kwargs.get("proxy"),
            proxy_config=kwargs.get("proxy_config", None),
            viewport_width=kwargs.get("viewport_width", 1080),
            viewport_height=kwargs.get("viewport_height", 600),
            viewport=kwargs.get("viewport"),
            accept_downloads=kwargs.get("accept_downloads", False),
            downloads_path=kwargs.get("downloads_path"),
            storage_state=kwargs.get("storage_state"),
            ignore_https_errors=kwargs.get("ignore_https_errors", True),
            java_script_enabled=kwargs.get("java_script_enabled", True),
            # Previously dropped, which made clone() reset both settings.
            sleep_on_close=kwargs.get("sleep_on_close", False),
            verbose=kwargs.get("verbose", True),
            cookies=kwargs.get("cookies", []),
            headers=kwargs.get("headers", {}),
            # NOTE(review): this fallback differs from the __init__ default
            # user agent; kept as-is to preserve existing behaviour.
            user_agent=kwargs.get(
                "user_agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            ),
            user_agent_mode=kwargs.get("user_agent_mode"),
            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
            text_mode=kwargs.get("text_mode", False),
            light_mode=kwargs.get("light_mode", False),
            extra_args=kwargs.get("extra_args", []),
            debugging_port=kwargs.get("debugging_port", 9222),
            host=kwargs.get("host", "localhost"),
        )

    def to_dict(self):
        """Return the configuration as a flat dictionary.

        ``viewport`` is not included; its effect is already reflected in
        ``viewport_width`` / ``viewport_height``.
        """
        result = {
            "browser_type": self.browser_type,
            "headless": self.headless,
            "browser_mode": self.browser_mode,
            "use_managed_browser": self.use_managed_browser,
            "cdp_url": self.cdp_url,
            "use_persistent_context": self.use_persistent_context,
            "user_data_dir": self.user_data_dir,
            "chrome_channel": self.chrome_channel,
            "channel": self.channel,
            "proxy": self.proxy,
            "proxy_config": self.proxy_config,
            "viewport_width": self.viewport_width,
            "viewport_height": self.viewport_height,
            "accept_downloads": self.accept_downloads,
            "downloads_path": self.downloads_path,
            "storage_state": self.storage_state,
            "ignore_https_errors": self.ignore_https_errors,
            "java_script_enabled": self.java_script_enabled,
            "cookies": self.cookies,
            "headers": self.headers,
            "user_agent": self.user_agent,
            "user_agent_mode": self.user_agent_mode,
            "user_agent_generator_config": self.user_agent_generator_config,
            "text_mode": self.text_mode,
            "light_mode": self.light_mode,
            "extra_args": self.extra_args,
            "sleep_on_close": self.sleep_on_close,
            "verbose": self.verbose,
            "debugging_port": self.debugging_port,
            "host": self.host,
        }
        return result

    def clone(self, **kwargs):
        """Create a copy of this configuration with updated values.

        NOTE(review): the copied headers already contain the "sec-ch-ua"
        entry of this instance, so overriding ``user_agent`` alone does not
        refresh that header in the clone.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            BrowserConfig: A new instance with the specified updates
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return BrowserConfig.from_kwargs(config_dict)

    def dump(self) -> dict:
        """Serialize this configuration with ``to_serializable_dict``."""
        return to_serializable_dict(self)

    @staticmethod
    def load(data: dict) -> "BrowserConfig":
        """Rebuild a BrowserConfig from ``dump()`` output or a plain options dict."""
        config = from_serializable_dict(data)
        if isinstance(config, BrowserConfig):
            return config
        return BrowserConfig.from_kwargs(config)
class HTTPCrawlerConfig:
    """Request options used by the HTTP-specific crawler."""

    # Class-level defaults mirror the constructor defaults.
    method: str = "GET"
    headers: Optional[Dict[str, str]] = None
    data: Optional[Dict[str, Any]] = None
    json: Optional[Dict[str, Any]] = None
    follow_redirects: bool = True
    verify_ssl: bool = True

    def __init__(
        self,
        method: str = "GET",
        headers: Optional[Dict[str, str]] = None,
        data: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        follow_redirects: bool = True,
        verify_ssl: bool = True,
    ):
        """Store the given request options on the instance."""
        self.method, self.headers = method, headers
        self.data, self.json = data, json
        self.follow_redirects, self.verify_ssl = follow_redirects, verify_ssl

    @staticmethod
    def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
        """Build a config from an options dictionary; missing keys use defaults."""
        lookup = kwargs.get
        return HTTPCrawlerConfig(
            method=lookup("method", "GET"),
            headers=lookup("headers"),
            data=lookup("data"),
            json=lookup("json"),
            follow_redirects=lookup("follow_redirects", True),
            verify_ssl=lookup("verify_ssl", True),
        )

    def to_dict(self):
        """Return the six request options as a dictionary."""
        field_names = (
            "method",
            "headers",
            "data",
            "json",
            "follow_redirects",
            "verify_ssl",
        )
        return {field: getattr(self, field) for field in field_names}

    def clone(self, **kwargs):
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            HTTPCrawlerConfig: A new instance with the specified updates
        """
        merged = {**self.to_dict(), **kwargs}
        return HTTPCrawlerConfig.from_kwargs(merged)

    def dump(self) -> dict:
        """Serialize this configuration with ``to_serializable_dict``."""
        return to_serializable_dict(self)

    @staticmethod
    def load(data: dict) -> "HTTPCrawlerConfig":
        """Rebuild a config from ``dump()`` output or a plain options dict."""
        restored = from_serializable_dict(data)
        if isinstance(restored, HTTPCrawlerConfig):
            return restored
        return HTTPCrawlerConfig.from_kwargs(restored)
extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages. Default: None (NoExtractionStrategy is used if None). chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction. Default: RegexChunking(). markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown. Default: None. only_text (bool): If True, attempt to extract text-only content where applicable. Default: False. css_selector (str or None): CSS selector to extract a specific portion of the page. Default: None. target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation and structured data extraction. When you set this, only the contents of these elements are processed for extraction and Markdown generation. If you do not set any value, the entire page is processed. The difference between this and css_selector is that this will shrink the initial raw HTML to the selected element, while this will only affect the extraction and Markdown generation. Default: None excluded_tags (list of str or None): List of HTML tags to exclude from processing. Default: None. excluded_selector (str or None): CSS selector to exclude from processing. Default: None. keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes. Default: False. keep_attrs (list of str): List of HTML attributes to keep during processing. Default: []. remove_forms (bool): If True, remove all `