diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index ed752252..13410c4f 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -2,7 +2,7 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode -from .async_configs import BrowserConfig, CrawlerRunConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig from .content_scraping_strategy import ( ContentScrapingStrategy, WebScrapingStrategy, @@ -70,6 +70,7 @@ __all__ = [ "LXMLWebScrapingStrategy", "BrowserConfig", "CrawlerRunConfig", + "HTTPCrawlerConfig", "ExtractionStrategy", "LLMExtractionStrategy", "CosineStrategy", diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index e43b1394..10b122dd 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,5 +1,5 @@ -import re -from attr import has +from email import header +from re import I from .config import ( MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, @@ -19,7 +19,6 @@ from typing import Union, List from .cache_context import CacheMode from .proxy_strategy import ProxyRotationStrategy - import inspect from typing import Any, Dict, Optional from enum import Enum @@ -47,8 +46,8 @@ def to_serializable_dict(obj: Any) -> Dict: if hasattr(obj, 'isoformat'): return obj.isoformat() - # Handle lists, tuples, and sets - if isinstance(obj, (list, tuple, set)): + # Handle lists, tuples, and sets, and basically any iterable + if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__'): return [to_serializable_dict(item) for item in obj] # Handle frozensets, which are not iterable @@ -67,7 +66,6 @@ def to_serializable_dict(obj: Any) -> Dict: # Get constructor signature sig = inspect.signature(obj.__class__.__init__) params = sig.parameters - _type = obj.__class__.__name__ # Get current values current_values = {} @@ -81,24 +79,8 @@ def to_serializable_dict(obj: Any) -> Dict: if not (is_empty_value(value) and is_empty_value(param.default)): if value != 
param.default: current_values[name] = to_serializable_dict(value) - elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__: - slot = f"_{name}" - slot_value = getattr(obj, slot, None) - if not is_empty_value(slot_value): - current_values[name] = to_serializable_dict(slot_value) - - # # Then handle slots if present - # if hasattr(obj.__class__, '__slots__'): - # for slot in obj.__class__.__slots__: - # # Remove leading underscore if present - # param_name = slot[1:] if slot.startswith('_') else slot - - # # Get the slot value if it exists - # if hasattr(obj, slot): - # value = getattr(obj, slot) - # if not is_empty_value(value): - # current_values[param_name] = to_serializable_dict(value) + _type = obj.__class__.__name__ return { "type": obj.__class__.__name__, @@ -126,10 +108,7 @@ def from_serializable_dict(data: Any) -> Any: # Import from crawl4ai for class instances import crawl4ai - if not hasattr(crawl4ai, data["type"]): - return None - else: - cls = getattr(crawl4ai, data["type"]) + cls = getattr(crawl4ai, data["type"]) # Handle Enum if issubclass(cls, Enum): @@ -390,16 +369,72 @@ class BrowserConfig(): def load( data: dict) -> "BrowserConfig": # Deserialize the object from a dictionary config = from_serializable_dict(data) - - # check if the deserialized object is an instance of BrowserConfig if isinstance(config, BrowserConfig): return config - elif isinstance(config, dict): - return BrowserConfig.from_kwargs(config) - else: - raise ValueError("Invalid data type for BrowserConfig") + return BrowserConfig.from_kwargs(config) +class HTTPCrawlerConfig(): + """HTTP-specific crawler configuration""" + method: str = "GET" + headers: Optional[Dict[str, str]] = None + data: Optional[Dict[str, Any]] = None + json: Optional[Dict[str, Any]] = None + follow_redirects: bool = True + verify_ssl: bool = True + + def __init__(self, method: str = "GET", headers: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, 
Any]] = None, follow_redirects: bool = True, verify_ssl: bool = True): + self.method = method + self.headers = headers + self.data = data + self.json = json + self.follow_redirects = follow_redirects + self.verify_ssl = verify_ssl + + @staticmethod + def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig": + return HTTPCrawlerConfig( + method=kwargs.get("method", "GET"), + headers=kwargs.get("headers"), + data=kwargs.get("data"), + json=kwargs.get("json"), + follow_redirects=kwargs.get("follow_redirects", True), + verify_ssl=kwargs.get("verify_ssl", True), + ) + + def to_dict(self): + return { + "method": self.method, + "headers": self.headers, + "data": self.data, + "json": self.json, + "follow_redirects": self.follow_redirects, + "verify_ssl": self.verify_ssl, + } + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + HTTPCrawlerConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return HTTPCrawlerConfig.from_kwargs(config_dict) + + def dump(self) -> dict: + return to_serializable_dict(self) + + @staticmethod + def load(data: dict) -> "HTTPCrawlerConfig": + config = from_serializable_dict(data) + if isinstance(config, HTTPCrawlerConfig): + return config + return HTTPCrawlerConfig.from_kwargs(config) + class CrawlerRunConfig(): """ Configuration class for controlling how the crawler runs each crawl operation. @@ -450,7 +485,7 @@ class CrawlerRunConfig(): # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. If None, defaults to CacheMode.ENABLED internally. - Default: None. + Default: CacheMode.BYPASS. session_id (str or None): Optional session ID to persist the browser context and the created page instance. If the ID already exists, the crawler does not create a new page and uses the current page to preserve the state. 
@@ -543,19 +578,27 @@ class CrawlerRunConfig(): log_console (bool): If True, log console messages from the page. Default: False. - # Streaming Parameters + # HTTP Crawler Strategy Parameters + method (str): HTTP method to use for the request, when using AsyncHTTPCrawlerStrategy. + Default: "GET". + data (dict): Data to send in the request body, when using AsyncHTTPCrawlerStrategy. + Default: None. + json (dict): JSON data to send in the request body, when using AsyncHTTPCrawlerStrategy. + + # Connection Parameters stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many. Default: False. - - # Optional Parameters - stream (bool): If True, stream the page content as it is being loaded. - url: str = None # This is not a compulsory parameter + check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False - user_agent (str): Custom User-Agent string to use. Default: None - user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided - user_agent as-is. Default: None. + Default: False. + user_agent (str): Custom User-Agent string to use. + Default: None. + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is. + Default: None. user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. Default: None. 
+ + url: str = None # This is not a compulsory parameter """ def __init__( @@ -580,7 +623,7 @@ class CrawlerRunConfig(): # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters - cache_mode: CacheMode =None, + cache_mode: CacheMode = CacheMode.BYPASS, session_id: str = None, bypass_cache: bool = False, disable_cache: bool = False, @@ -625,7 +668,8 @@ class CrawlerRunConfig(): # Debugging and Logging Parameters verbose: bool = True, log_console: bool = False, - # Streaming Parameters + # Connection Parameters + method: str = "GET", stream: bool = False, url: str = None, check_robots_txt: bool = False, @@ -713,8 +757,9 @@ class CrawlerRunConfig(): self.verbose = verbose self.log_console = log_console - # Streaming Parameters + # Connection Parameters self.stream = stream + self.method = method # Robots.txt Handling Parameters self.check_robots_txt = check_robots_txt @@ -769,7 +814,7 @@ class CrawlerRunConfig(): # SSL Parameters fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), # Caching Parameters - cache_mode=kwargs.get("cache_mode"), + cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS), session_id=kwargs.get("session_id"), bypass_cache=kwargs.get("bypass_cache", False), disable_cache=kwargs.get("disable_cache", False), @@ -823,15 +868,17 @@ class CrawlerRunConfig(): # Debugging and Logging Parameters verbose=kwargs.get("verbose", True), log_console=kwargs.get("log_console", False), - # Streaming Parameters + # Connection Parameters + method=kwargs.get("method", "GET"), stream=kwargs.get("stream", False), - url=kwargs.get("url"), check_robots_txt=kwargs.get("check_robots_txt", False), user_agent=kwargs.get("user_agent"), user_agent_mode=kwargs.get("user_agent_mode"), user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), # Deep Crawl Parameters deep_crawl_strategy=kwargs.get("deep_crawl_strategy"), + + url=kwargs.get("url"), ) # Create a funciton returns dict of the object @@ -843,13 +890,9 @@ class 
CrawlerRunConfig(): def load(data: dict) -> "CrawlerRunConfig": # Deserialize the object from a dictionary config = from_serializable_dict(data) - # If config type is alread instant of CrawleRunConfig, return it if isinstance(config, CrawlerRunConfig): return config - elif isinstance(config, dict): - return CrawlerRunConfig.from_kwargs(config) - else: - raise ValueError("Invalid data type") + return CrawlerRunConfig.from_kwargs(config) def to_dict(self): return { @@ -910,13 +953,14 @@ class CrawlerRunConfig(): "exclude_internal_links": self.exclude_internal_links, "verbose": self.verbose, "log_console": self.log_console, + "method": self.method, "stream": self.stream, - "url": self.url, "check_robots_txt": self.check_robots_txt, "user_agent": self.user_agent, "user_agent_mode": self.user_agent_mode, "user_agent_generator_config": self.user_agent_generator_config, "deep_crawl_strategy": self.deep_crawl_strategy, + "url": self.url, } def clone(self, **kwargs): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 9ae9b5a8..d93e27d1 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1,14 +1,13 @@ +from __future__ import annotations + import asyncio import base64 import time from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Union +from typing import Callable, Dict, Any, List, Union +from typing import Optional, AsyncGenerator, Final import os -import sys -import shutil -import tempfile -import subprocess -from playwright.async_api import Page, Error, BrowserContext +from playwright.async_api import Page, Error from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont @@ -16,796 +15,21 @@ import hashlib import uuid from .js_snippet import load_js_script from .models import AsyncCrawlResponse -from .user_agent_generator import UserAgentGenerator -from .config import 
SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT -from .async_configs import BrowserConfig, CrawlerRunConfig +from .config import SCREENSHOT_HEIGHT_TRESHOLD +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig from .async_logger import AsyncLogger -from playwright_stealth import StealthConfig from .ssl_certificate import SSLCertificate -from .utils import get_home_folder, get_chromium_path -from .user_agent_generator import ValidUAGenerator, OnlineUAGenerator - -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) - -BROWSER_DISABLE_OPTIONS = [ - "--disable-background-networking", - "--disable-background-timer-throttling", - "--disable-backgrounding-occluded-windows", - "--disable-breakpad", - "--disable-client-side-phishing-detection", - "--disable-component-extensions-with-background-pages", - "--disable-default-apps", - "--disable-extensions", - "--disable-features=TranslateUI", - "--disable-hang-monitor", - "--disable-ipc-flooding-protection", - "--disable-popup-blocking", - "--disable-prompt-on-repost", - "--disable-sync", - "--force-color-profile=srgb", - "--metrics-recording-only", - "--no-first-run", - "--password-store=basic", - "--use-mock-keychain", -] - - -class ManagedBrowser: - """ - Manages the browser process and context. This class allows to connect to the browser using CDP protocol. - - Attributes: - browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". - Default: "chromium". - user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a - temporary directory may be used. Default: None. - headless (bool): Whether to run the browser in headless mode (no visible GUI). 
- Default: True. - browser_process (subprocess.Popen): The process object for the browser. - temp_dir (str): Temporary directory for user data if not provided. - debugging_port (int): Port for debugging the browser. - host (str): Host for debugging the browser. - - Methods: - start(): Starts the browser process and returns the CDP endpoint URL. - _get_browser_path(): Returns the browser executable path based on OS and browser type. - _get_browser_args(): Returns browser-specific command line arguments. - _get_user_data_dir(): Returns the user data directory path. - _cleanup(): Terminates the browser process and removes the temporary directory. - """ - - browser_type: str - user_data_dir: str - headless: bool - browser_process: subprocess.Popen - temp_dir: str - debugging_port: int - host: str - - def __init__( - self, - browser_type: str = "chromium", - user_data_dir: Optional[str] = None, - headless: bool = False, - logger=None, - host: str = "localhost", - debugging_port: int = 9222, - cdp_url: Optional[str] = None, - ): - """ - Initialize the ManagedBrowser instance. - - Args: - browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". - Default: "chromium". - user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a - temporary directory may be used. Default: None. - headless (bool): Whether to run the browser in headless mode (no visible GUI). - Default: True. - logger (logging.Logger): Logger instance for logging messages. Default: None. - host (str): Host for debugging the browser. Default: "localhost". - debugging_port (int): Port for debugging the browser. Default: 9222. - cdp_url (str or None): CDP URL to connect to the browser. Default: None. 
- """ - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless - self.browser_process = None - self.temp_dir = None - self.debugging_port = debugging_port - self.host = host - self.logger = logger - self.shutting_down = False - self.cdp_url = cdp_url - - async def start(self) -> str: - """ - Starts the browser process or returns CDP endpoint URL. - If cdp_url is provided, returns it directly. - If user_data_dir is not provided for local browser, creates a temporary directory. - - Returns: - str: CDP endpoint URL - """ - # If CDP URL provided, just return it - if self.cdp_url: - return self.cdp_url - - # Create temp dir if needed - if not self.user_data_dir: - self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") - self.user_data_dir = self.temp_dir - - # Get browser path and args based on OS and browser type - # browser_path = self._get_browser_path() - args = await self._get_browser_args() - - # Start browser process - try: - self.browser_process = subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - # Monitor browser process output for errors - asyncio.create_task(self._monitor_browser_process()) - await asyncio.sleep(2) # Give browser time to start - return f"http://{self.host}:{self.debugging_port}" - except Exception as e: - await self.cleanup() - raise Exception(f"Failed to start browser: {e}") - - async def _monitor_browser_process(self): - """ - Monitor the browser process for unexpected termination. - - How it works: - 1. Read stdout and stderr from the browser process. - 2. If the process has terminated, log the error message and terminate the browser. - 3. If the shutting_down flag is set, log the normal termination message. - 4. If any other error occurs, log the error message. - - Note: This method should be called in a separate task to avoid blocking the main event loop. 
- """ - if self.browser_process: - try: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read), - ) - - # Check shutting_down flag BEFORE logging anything - if self.browser_process.poll() is not None: - if not self.shutting_down: - self.logger.error( - message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", - tag="ERROR", - params={ - "code": self.browser_process.returncode, - "stdout": stdout.decode(), - "stderr": stderr.decode(), - }, - ) - await self.cleanup() - else: - self.logger.info( - message="Browser process terminated normally | Code: {code}", - tag="INFO", - params={"code": self.browser_process.returncode}, - ) - except Exception as e: - if not self.shutting_down: - self.logger.error( - message="Error monitoring browser process: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - def _get_browser_path_WIP(self) -> str: - """Returns the browser executable path based on OS and browser type""" - if sys.platform == "darwin": # macOS - paths = { - "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari", - } - elif sys.platform == "win32": # Windows - paths = { - "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None, # WebKit not supported on Windows - } - else: # Linux - paths = { - "chromium": "google-chrome", - "firefox": "firefox", - "webkit": None, # WebKit not supported on Linux - } - - return paths.get(self.browser_type) - - async def _get_browser_path(self) -> str: - browser_path = await get_chromium_path(self.browser_type) - return browser_path - - async def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - 
base_args = [await self._get_browser_path()] - - if self.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.debugging_port}", - f"--user-data-dir={self.user_data_dir}", - ] - if self.headless: - args.append("--headless=new") - elif self.browser_type == "firefox": - args = [ - "--remote-debugging-port", - str(self.debugging_port), - "--profile", - self.user_data_dir, - ] - if self.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args - - async def cleanup(self): - """Cleanup browser process and temporary directory""" - # Set shutting_down flag BEFORE any termination actions - self.shutting_down = True - - if self.browser_process: - try: - self.browser_process.terminate() - # Wait for process to end gracefully - for _ in range(10): # 10 attempts, 100ms each - if self.browser_process.poll() is not None: - break - await asyncio.sleep(0.1) - - # Force kill if still running - if self.browser_process.poll() is None: - self.browser_process.kill() - await asyncio.sleep(0.1) # Brief wait for kill to take effect - - except Exception as e: - self.logger.error( - message="Error terminating browser: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - -class BrowserManager: - """ - Manages the browser instance and context. 
- - Attributes: - config (BrowserConfig): Configuration object containing all browser settings - logger: Logger instance for recording events and errors - browser (Browser): The browser instance - default_context (BrowserContext): The default browser context - managed_browser (ManagedBrowser): The managed browser instance - playwright (Playwright): The Playwright instance - sessions (dict): Dictionary to store session information - session_ttl (int): Session timeout in seconds - """ - - def __init__(self, browser_config: BrowserConfig, logger=None): - """ - Initialize the BrowserManager with a browser configuration. - - Args: - browser_config (BrowserConfig): Configuration object containing all browser settings - logger: Logger instance for recording events and errors - """ - self.config: BrowserConfig = browser_config - self.logger = logger - - # Browser state - self.browser = None - self.default_context = None - self.managed_browser = None - self.playwright = None - - # Session management - self.sessions = {} - self.session_ttl = 1800 # 30 minutes - - # Keep track of contexts by a "config signature," so each unique config reuses a single context - self.contexts_by_config = {} - self._contexts_lock = asyncio.Lock() - - # Initialize ManagedBrowser if needed - if self.config.use_managed_browser: - self.managed_browser = ManagedBrowser( - browser_type=self.config.browser_type, - user_data_dir=self.config.user_data_dir, - headless=self.config.headless, - logger=self.logger, - debugging_port=self.config.debugging_port, - ) - - async def start(self): - """ - Start the browser instance and set up the default context. - - How it works: - 1. Check if Playwright is already initialized. - 2. If not, initialize Playwright. - 3. If managed browser is used, start it and connect to the CDP endpoint. - 4. If managed browser is not used, launch the browser and set up the default context. 
- - Note: This method should be called in a separate task to avoid blocking the main event loop. - """ - if self.playwright is None: - from playwright.async_api import async_playwright - - self.playwright = await async_playwright().start() - - if self.config.use_managed_browser: - cdp_url = await self.managed_browser.start() - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - self.default_context = await self.create_browser_context() - # self.default_context = await self.browser.new_context( - # viewport={ - # "width": self.config.viewport_width, - # "height": self.config.viewport_height, - # }, - # storage_state=self.config.storage_state, - # user_agent=self.config.headers.get( - # "User-Agent", self.config.user_agent - # ), - # accept_downloads=self.config.accept_downloads, - # ignore_https_errors=self.config.ignore_https_errors, - # java_script_enabled=self.config.java_script_enabled, - # ) - await self.setup_context(self.default_context) - else: - browser_args = self._build_browser_args() - - # Launch appropriate browser type - if self.config.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.config.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - self.default_context = self.browser - - def _build_browser_args(self) -> dict: - """Build browser launch arguments from config.""" - args = [ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - 
"--window-position=400,0", - "--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", - "--force-color-profile=srgb", - "--mute-audio", - "--disable-background-timer-throttling", - # "--single-process", - f"--window-size={self.config.viewport_width},{self.config.viewport_height}", - ] - - if self.config.light_mode: - args.extend(BROWSER_DISABLE_OPTIONS) - - if self.config.text_mode: - args.extend( - [ - "--blink-settings=imagesEnabled=false", - "--disable-remote-fonts", - "--disable-images", - "--disable-javascript", - "--disable-software-rasterizer", - "--disable-dev-shm-usage", - ] - ) - - if self.config.extra_args: - args.extend(self.config.extra_args) - - browser_args = {"headless": self.config.headless, "args": args} - - if self.config.chrome_channel: - browser_args["channel"] = self.config.chrome_channel - - if self.config.accept_downloads: - browser_args["downloads_path"] = self.config.downloads_path or os.path.join( - os.getcwd(), "downloads" - ) - os.makedirs(browser_args["downloads_path"], exist_ok=True) - - if self.config.proxy or self.config.proxy_config: - from playwright.async_api import ProxySettings - - proxy_settings = ( - ProxySettings(server=self.config.proxy) - if self.config.proxy - else ProxySettings( - server=self.config.proxy_config.get("server"), - username=self.config.proxy_config.get("username"), - password=self.config.proxy_config.get("password"), - ) - ) - browser_args["proxy"] = proxy_settings - - return browser_args - - async def setup_context( - self, - context: BrowserContext, - crawlerRunConfig: CrawlerRunConfig = None, - is_default=False, - ): - """ - Set up a browser context with the configured options. - - How it works: - 1. Set extra HTTP headers if provided. - 2. Add cookies if provided. - 3. Load storage state if provided. - 4. Accept downloads if enabled. - 5. Set default timeouts for navigation and download. - 6. Set user agent if provided. - 7. Set browser hints if provided. - 8. Set proxy if provided. 
- 9. Set downloads path if provided. - 10. Set storage state if provided. - 11. Set cache if provided. - 12. Set extra HTTP headers if provided. - 13. Add cookies if provided. - 14. Set default timeouts for navigation and download if enabled. - 15. Set user agent if provided. - 16. Set browser hints if provided. - - Args: - context (BrowserContext): The browser context to set up - crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings - is_default (bool): Flag indicating if this is the default context - Returns: - None - """ - if self.config.headers: - await context.set_extra_http_headers(self.config.headers) - - if self.config.cookies: - await context.add_cookies(self.config.cookies) - - if self.config.storage_state: - await context.storage_state(path=None) - - if self.config.accept_downloads: - context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) - context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) - if self.config.downloads_path: - context._impl_obj._options["accept_downloads"] = True - context._impl_obj._options[ - "downloads_path" - ] = self.config.downloads_path - - # Handle user agent and browser hints - if self.config.user_agent: - combined_headers = { - "User-Agent": self.config.user_agent, - "sec-ch-ua": self.config.browser_hint, - } - combined_headers.update(self.config.headers) - await context.set_extra_http_headers(combined_headers) - - # Add default cookie - await context.add_cookies( - [ - { - "name": "cookiesEnabled", - "value": "true", - "url": crawlerRunConfig.url - if crawlerRunConfig - else "https://crawl4ai.com/", - } - ] - ) - - # Handle navigator overrides - if crawlerRunConfig: - if ( - crawlerRunConfig.override_navigator - or crawlerRunConfig.simulate_user - or crawlerRunConfig.magic - ): - await context.add_init_script(load_js_script("navigator_overrider")) - - async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None): - """ - Creates and returns a new browser context with 
configured settings. - Applies text-only mode settings if text_mode is enabled in config. - - Returns: - Context: Browser context object with the specified configurations - """ - # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) - viewport_settings = { - "width": self.config.viewport_width, - "height": self.config.viewport_height, - } - proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - - blocked_extensions = [ - # Images - "jpg", - "jpeg", - "png", - "gif", - "webp", - "svg", - "ico", - "bmp", - "tiff", - "psd", - # Fonts - "woff", - "woff2", - "ttf", - "otf", - "eot", - # Styles - # 'css', 'less', 'scss', 'sass', - # Media - "mp4", - "webm", - "ogg", - "avi", - "mov", - "wmv", - "flv", - "m4v", - "mp3", - "wav", - "aac", - "m4a", - "opus", - "flac", - # Documents - "pdf", - "doc", - "docx", - "xls", - "xlsx", - "ppt", - "pptx", - # Archives - "zip", - "rar", - "7z", - "tar", - "gz", - # Scripts and data - "xml", - "swf", - "wasm", - ] - - # Common context settings - context_settings = { - "user_agent": user_agent, - "viewport": viewport_settings, - "proxy": proxy_settings, - "accept_downloads": self.config.accept_downloads, - "storage_state": self.config.storage_state, - "ignore_https_errors": self.config.ignore_https_errors, - "device_scale_factor": 1.0, - "java_script_enabled": self.config.java_script_enabled, - } - - if crawlerRunConfig: - # Check if there is value for crawlerRunConfig.proxy_config set add that to context - if crawlerRunConfig.proxy_config: - proxy_settings = { - "server": crawlerRunConfig.proxy_config.get("server"), - } - if crawlerRunConfig.proxy_config.get("username"): - proxy_settings.update({ - "username": crawlerRunConfig.proxy_config.get("username"), - "password": crawlerRunConfig.proxy_config.get("password"), - }) - context_settings["proxy"] = proxy_settings - - if self.config.text_mode: - text_mode_settings = { - "has_touch": False, - "is_mobile": False, - } - # 
Update context settings with text mode settings - context_settings.update(text_mode_settings) - - # Create and return the context with all settings - context = await self.browser.new_context(**context_settings) - - # Apply text mode settings if enabled - if self.config.text_mode: - # Create and apply route patterns for each extension - for ext in blocked_extensions: - await context.route(f"**/*.{ext}", lambda route: route.abort()) - return context - - def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: - """ - Converts the crawlerRunConfig into a dict, excludes ephemeral fields, - then returns a hash of the sorted JSON. This yields a stable signature - that identifies configurations requiring a unique browser context. - """ - import json, hashlib - - config_dict = crawlerRunConfig.__dict__.copy() - # Exclude items that do not affect browser-level setup. - # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config. - ephemeral_keys = [ - "session_id", - "js_code", - "scraping_strategy", - "extraction_strategy", - "chunking_strategy", - "cache_mode", - "content_filter", - "semaphore_count", - "url" - ] - for key in ephemeral_keys: - if key in config_dict: - del config_dict[key] - # Convert to canonical JSON string - signature_json = json.dumps(config_dict, sort_keys=True, default=str) - - # Hash the JSON so we get a compact, unique string - signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() - return signature_hash - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig): - """ - Get a page for the given session ID, creating a new one if needed. 
- - Args: - crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings - - Returns: - (page, context): The Page and its BrowserContext - """ - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context - - # If using a managed browser, just grab the shared default_context - if self.config.use_managed_browser: - context = self.default_context - page = await context.new_page() - else: - # Otherwise, check if we have an existing context for this config - config_signature = self._make_config_signature(crawlerRunConfig) - - async with self._contexts_lock: - if config_signature in self.contexts_by_config: - context = self.contexts_by_config[config_signature] - else: - # Create and setup a new context - context = await self.create_browser_context(crawlerRunConfig) - await self.setup_context(context, crawlerRunConfig) - self.contexts_by_config[config_signature] = context - - # Create a new page from the chosen context - page = await context.new_page() - - # If a session_id is specified, store this session so we can reuse later - if crawlerRunConfig.session_id: - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - - return page, context - - async def kill_session(self, session_id: str): - """ - Kill a browser session and clean up resources. - - Args: - session_id (str): The session ID to kill. 
- """ - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - if not self.config.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) - - async def close(self): - """Close all browser resources and clean up.""" - if self.config.sleep_on_close: - await asyncio.sleep(0.5) - - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self.kill_session(session_id) - - # Now close all contexts we created. This reclaims memory from ephemeral contexts. - for ctx in self.contexts_by_config.values(): - try: - await ctx.close() - except Exception as e: - self.logger.error( - message="Error closing context: {error}", - tag="ERROR", - params={"error": str(e)} - ) - self.contexts_by_config.clear() - - if self.browser: - await self.browser.close() - self.browser = None - - if self.managed_browser: - await asyncio.sleep(0.5) - await self.managed_browser.cleanup() - self.managed_browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - +from .user_agent_generator import ValidUAGenerator +from .browser_manager import BrowserManager + +import aiofiles +import aiohttp +import cchardet +from aiohttp.client import ClientTimeout +from urllib.parse import urlparse +from types import MappingProxyType +import contextlib +from functools import partial class AsyncCrawlerStrategy(ABC): """ @@ -817,7 +41,6 @@ class AsyncCrawlerStrategy(ABC): async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: pass # 4 + 3 - class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ Crawler strategy using Playwright. 
@@ -2369,3 +1592,267 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": str(e)}, ) return True # Default to scrolling if check fails + + +#################################################################################################### +# HTTP Crawler Strategy +#################################################################################################### + +class HTTPCrawlerError(Exception): + """Base error class for HTTP crawler specific exceptions""" + pass + + +class ConnectionTimeoutError(HTTPCrawlerError): + """Raised when connection timeout occurs""" + pass + + +class HTTPStatusError(HTTPCrawlerError): + """Raised for unexpected status codes""" + def __init__(self, status_code: int, message: str): + self.status_code = status_code + super().__init__(f"HTTP {status_code}: {message}") + + +class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): + """ + Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency. + """ + + __slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config') + + DEFAULT_TIMEOUT: Final[int] = 30 + DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024 + DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4) + DEFAULT_DNS_CACHE_TTL: Final[int] = 300 + VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'}) + + _BASE_HEADERS: Final = MappingProxyType({ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + }) + + def __init__( + self, + browser_config: Optional[HTTPCrawlerConfig] = None, + logger: Optional[AsyncLogger] = None, + max_connections: int = DEFAULT_MAX_CONNECTIONS, + dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL, + chunk_size: int = DEFAULT_CHUNK_SIZE + ): + 
"""Initialize the HTTP crawler with config""" + self.browser_config = browser_config or HTTPCrawlerConfig() + self.logger = logger + self.max_connections = max_connections + self.dns_cache_ttl = dns_cache_ttl + self.chunk_size = chunk_size + self._session: Optional[aiohttp.ClientSession] = None + + self.hooks = { + k: partial(self._execute_hook, k) + for k in ('before_request', 'after_request', 'on_error') + } + + # Set default hooks + self.set_hook('before_request', lambda *args, **kwargs: None) + self.set_hook('after_request', lambda *args, **kwargs: None) + self.set_hook('on_error', lambda *args, **kwargs: None) + + + async def __aenter__(self) -> AsyncHTTPCrawlerStrategy: + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + await self.close() + + @contextlib.asynccontextmanager + async def _session_context(self): + try: + if not self._session: + await self.start() + yield self._session + finally: + await self.close() + + def set_hook(self, hook_type: str, hook_func: Callable) -> None: + if hook_type in self.hooks: + self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def _execute_hook( + self, + hook_type: str, + hook_func: Callable, + *args: Any, + **kwargs: Any + ) -> Any: + if asyncio.iscoroutinefunction(hook_func): + return await hook_func(*args, **kwargs) + return hook_func(*args, **kwargs) + + async def start(self) -> None: + if not self._session: + connector = aiohttp.TCPConnector( + limit=self.max_connections, + ttl_dns_cache=self.dns_cache_ttl, + use_dns_cache=True, + force_close=False + ) + self._session = aiohttp.ClientSession( + headers=dict(self._BASE_HEADERS), + connector=connector, + timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT) + ) + + async def close(self) -> None: + if self._session and not self._session.closed: + try: + await asyncio.wait_for(self._session.close(), timeout=5.0) + except 
asyncio.TimeoutError: + if self.logger: + self.logger.warning( + message="Session cleanup timed out", + tag="CLEANUP" + ) + finally: + self._session = None + + async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]: + async with aiofiles.open(path, mode='rb') as f: + while chunk := await f.read(self.chunk_size): + yield memoryview(chunk) + + async def _handle_file(self, path: str) -> AsyncCrawlResponse: + if not os.path.exists(path): + raise FileNotFoundError(f"Local file not found: {path}") + + chunks = [] + async for chunk in self._stream_file(path): + chunks.append(chunk.tobytes().decode('utf-8', errors='replace')) + + return AsyncCrawlResponse( + html=''.join(chunks), + response_headers={}, + status_code=200 + ) + + async def _handle_raw(self, content: str) -> AsyncCrawlResponse: + return AsyncCrawlResponse( + html=content, + response_headers={}, + status_code=200 + ) + + + async def _handle_http( + self, + url: str, + config: CrawlerRunConfig + ) -> AsyncCrawlResponse: + async with self._session_context() as session: + timeout = ClientTimeout( + total=config.page_timeout or self.DEFAULT_TIMEOUT, + connect=10, + sock_read=30 + ) + + headers = dict(self._BASE_HEADERS) + if self.browser_config.headers: + headers.update(self.browser_config.headers) + + request_kwargs = { + 'timeout': timeout, + 'allow_redirects': self.browser_config.follow_redirects, + 'ssl': self.browser_config.verify_ssl, + 'headers': headers + } + + if self.browser_config.method == "POST": + if self.browser_config.data: + request_kwargs['data'] = self.browser_config.data + if self.browser_config.json: + request_kwargs['json'] = self.browser_config.json + + await self.hooks['before_request'](url, request_kwargs) + + try: + async with session.request(self.browser_config.method, url, **request_kwargs) as response: + content = memoryview(await response.read()) + + if not (200 <= response.status < 300): + raise HTTPStatusError( + response.status, + f"Unexpected status code for 
{url}" + ) + + encoding = response.charset + if not encoding: + encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' + + result = AsyncCrawlResponse( + html=content.tobytes().decode(encoding, errors='replace'), + response_headers=dict(response.headers), + status_code=response.status, + redirected_url=str(response.url) + ) + + await self.hooks['after_request'](result) + return result + + except aiohttp.ServerTimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except aiohttp.ClientConnectorError as e: + await self.hooks['on_error'](e) + raise ConnectionError(f"Connection failed: {str(e)}") + + except aiohttp.ClientError as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP client error: {str(e)}") + + except asyncio.exceptions.TimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except Exception as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") + + async def crawl( + self, + url: str, + config: Optional[CrawlerRunConfig] = None, + **kwargs + ) -> AsyncCrawlResponse: + config = config or CrawlerRunConfig.from_kwargs(kwargs) + + parsed = urlparse(url) + scheme = parsed.scheme.rstrip('/') + + if scheme not in self.VALID_SCHEMES: + raise ValueError(f"Unsupported URL scheme: {scheme}") + + try: + if scheme == 'file': + return await self._handle_file(parsed.path) + elif scheme == 'raw': + return await self._handle_raw(parsed.path) + else: # http or https + return await self._handle_http(url, config) + + except Exception as e: + if self.logger: + self.logger.error( + message="Crawl failed: {error}", + tag="CRAWL", + params={"error": str(e), "url": url} + ) + raise \ No newline at end of file diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py new file mode 100644 index 00000000..0462cb84 --- /dev/null +++ b/crawl4ai/browser_manager.py @@ 
-0,0 +1,796 @@ +import asyncio +import time +from typing import List, Optional +import os +import sys +import shutil +import tempfile +import subprocess +from playwright.async_api import BrowserContext +import hashlib +from .js_snippet import load_js_script +from .config import DOWNLOAD_PAGE_TIMEOUT +from .async_configs import BrowserConfig, CrawlerRunConfig +from playwright_stealth import StealthConfig +from .utils import get_chromium_path + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + +BROWSER_DISABLE_OPTIONS = [ + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain", +] + + +class ManagedBrowser: + """ + Manages the browser process and context. This class allows to connect to the browser using CDP protocol. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. 
+ browser_process (subprocess.Popen): The process object for the browser. + temp_dir (str): Temporary directory for user data if not provided. + debugging_port (int): Port for debugging the browser. + host (str): Host for debugging the browser. + + Methods: + start(): Starts the browser process and returns the CDP endpoint URL. + _get_browser_path(): Returns the browser executable path based on OS and browser type. + _get_browser_args(): Returns browser-specific command line arguments. + _get_user_data_dir(): Returns the user data directory path. + _cleanup(): Terminates the browser process and removes the temporary directory. + """ + + browser_type: str + user_data_dir: str + headless: bool + browser_process: subprocess.Popen + temp_dir: str + debugging_port: int + host: str + + def __init__( + self, + browser_type: str = "chromium", + user_data_dir: Optional[str] = None, + headless: bool = False, + logger=None, + host: str = "localhost", + debugging_port: int = 9222, + cdp_url: Optional[str] = None, + ): + """ + Initialize the ManagedBrowser instance. + + Args: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + logger (logging.Logger): Logger instance for logging messages. Default: None. + host (str): Host for debugging the browser. Default: "localhost". + debugging_port (int): Port for debugging the browser. Default: 9222. + cdp_url (str or None): CDP URL to connect to the browser. Default: None. 
+ """ + self.browser_type = browser_type + self.user_data_dir = user_data_dir + self.headless = headless + self.browser_process = None + self.temp_dir = None + self.debugging_port = debugging_port + self.host = host + self.logger = logger + self.shutting_down = False + self.cdp_url = cdp_url + + async def start(self) -> str: + """ + Starts the browser process or returns CDP endpoint URL. + If cdp_url is provided, returns it directly. + If user_data_dir is not provided for local browser, creates a temporary directory. + + Returns: + str: CDP endpoint URL + """ + # If CDP URL provided, just return it + if self.cdp_url: + return self.cdp_url + + # Create temp dir if needed + if not self.user_data_dir: + self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") + self.user_data_dir = self.temp_dir + + # Get browser path and args based on OS and browser type + # browser_path = self._get_browser_path() + args = await self._get_browser_args() + + # Start browser process + try: + self.browser_process = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) + await asyncio.sleep(2) # Give browser time to start + return f"http://{self.host}:{self.debugging_port}" + except Exception as e: + await self.cleanup() + raise Exception(f"Failed to start browser: {e}") + + async def _monitor_browser_process(self): + """ + Monitor the browser process for unexpected termination. + + How it works: + 1. Read stdout and stderr from the browser process. + 2. If the process has terminated, log the error message and terminate the browser. + 3. If the shutting_down flag is set, log the normal termination message. + 4. If any other error occurs, log the error message. + + Note: This method should be called in a separate task to avoid blocking the main event loop. 
+ """ + if self.browser_process: + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read), + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode(), + }, + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode}, + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + def _get_browser_path_WIP(self) -> str: + """Returns the browser executable path based on OS and browser type""" + if sys.platform == "darwin": # macOS + paths = { + "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari", + } + elif sys.platform == "win32": # Windows + paths = { + "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", + "webkit": None, # WebKit not supported on Windows + } + else: # Linux + paths = { + "chromium": "google-chrome", + "firefox": "firefox", + "webkit": None, # WebKit not supported on Linux + } + + return paths.get(self.browser_type) + + async def _get_browser_path(self) -> str: + browser_path = await get_chromium_path(self.browser_type) + return browser_path + + async def _get_browser_args(self) -> List[str]: + """Returns browser-specific command line arguments""" + 
base_args = [await self._get_browser_path()] + + if self.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.debugging_port}", + f"--user-data-dir={self.user_data_dir}", + ] + if self.headless: + args.append("--headless=new") + elif self.browser_type == "firefox": + args = [ + "--remote-debugging-port", + str(self.debugging_port), + "--profile", + self.user_data_dir, + ] + if self.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.browser_type} not supported") + + return base_args + args + + async def cleanup(self): + """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + + if self.browser_process: + try: + self.browser_process.terminate() + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running + if self.browser_process.poll() is None: + self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + + except Exception as e: + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + +class BrowserManager: + """ + Manages the browser instance and context. 
+ + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser (Browser): The browser instance + default_context (BrowserContext): The default browser context + managed_browser (ManagedBrowser): The managed browser instance + playwright (Playwright): The Playwright instance + sessions (dict): Dictionary to store session information + session_ttl (int): Session timeout in seconds + """ + + def __init__(self, browser_config: BrowserConfig, logger=None): + """ + Initialize the BrowserManager with a browser configuration. + + Args: + browser_config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + """ + self.config: BrowserConfig = browser_config + self.logger = logger + + # Browser state + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + + # Session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + # Keep track of contexts by a "config signature," so each unique config reuses a single context + self.contexts_by_config = {} + self._contexts_lock = asyncio.Lock() + + # Initialize ManagedBrowser if needed + if self.config.use_managed_browser: + self.managed_browser = ManagedBrowser( + browser_type=self.config.browser_type, + user_data_dir=self.config.user_data_dir, + headless=self.config.headless, + logger=self.logger, + debugging_port=self.config.debugging_port, + ) + + async def start(self): + """ + Start the browser instance and set up the default context. + + How it works: + 1. Check if Playwright is already initialized. + 2. If not, initialize Playwright. + 3. If managed browser is used, start it and connect to the CDP endpoint. + 4. If managed browser is not used, launch the browser and set up the default context. 
+ + Note: This method should be called in a separate task to avoid blocking the main event loop. + """ + if self.playwright is None: + from playwright.async_api import async_playwright + + self.playwright = await async_playwright().start() + + if self.config.use_managed_browser: + cdp_url = await self.managed_browser.start() + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + # self.default_context = await self.browser.new_context( + # viewport={ + # "width": self.config.viewport_width, + # "height": self.config.viewport_height, + # }, + # storage_state=self.config.storage_state, + # user_agent=self.config.headers.get( + # "User-Agent", self.config.user_agent + # ), + # accept_downloads=self.config.accept_downloads, + # ignore_https_errors=self.config.ignore_https_errors, + # java_script_enabled=self.config.java_script_enabled, + # ) + await self.setup_context(self.default_context) + else: + browser_args = self._build_browser_args() + + # Launch appropriate browser type + if self.config.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.config.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + self.default_context = self.browser + + def _build_browser_args(self) -> dict: + """Build browser launch arguments from config.""" + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + 
"--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + # "--single-process", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] + + if self.config.light_mode: + args.extend(BROWSER_DISABLE_OPTIONS) + + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) + + if self.config.extra_args: + args.extend(self.config.extra_args) + + browser_args = {"headless": self.config.headless, "args": args} + + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + if self.config.accept_downloads: + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.config.proxy or self.config.proxy_config: + from playwright.async_api import ProxySettings + + proxy_settings = ( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( + server=self.config.proxy_config.get("server"), + username=self.config.proxy_config.get("username"), + password=self.config.proxy_config.get("password"), + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args + + async def setup_context( + self, + context: BrowserContext, + crawlerRunConfig: CrawlerRunConfig = None, + is_default=False, + ): + """ + Set up a browser context with the configured options. + + How it works: + 1. Set extra HTTP headers if provided. + 2. Add cookies if provided. + 3. Load storage state if provided. + 4. Accept downloads if enabled. + 5. Set default timeouts for navigation and download. + 6. Set user agent if provided. + 7. Set browser hints if provided. + 8. Set proxy if provided. 
+ 9. Set downloads path if provided. + 10. Set storage state if provided. + 11. Set cache if provided. + 12. Set extra HTTP headers if provided. + 13. Add cookies if provided. + 14. Set default timeouts for navigation and download if enabled. + 15. Set user agent if provided. + 16. Set browser hints if provided. + + Args: + context (BrowserContext): The browser context to set up + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + is_default (bool): Flag indicating if this is the default context + Returns: + None + """ + if self.config.headers: + await context.set_extra_http_headers(self.config.headers) + + if self.config.cookies: + await context.add_cookies(self.config.cookies) + + if self.config.storage_state: + await context.storage_state(path=None) + + if self.config.accept_downloads: + context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) + context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) + if self.config.downloads_path: + context._impl_obj._options["accept_downloads"] = True + context._impl_obj._options[ + "downloads_path" + ] = self.config.downloads_path + + # Handle user agent and browser hints + if self.config.user_agent: + combined_headers = { + "User-Agent": self.config.user_agent, + "sec-ch-ua": self.config.browser_hint, + } + combined_headers.update(self.config.headers) + await context.set_extra_http_headers(combined_headers) + + # Add default cookie + await context.add_cookies( + [ + { + "name": "cookiesEnabled", + "value": "true", + "url": crawlerRunConfig.url + if crawlerRunConfig + else "https://crawl4ai.com/", + } + ] + ) + + # Handle navigator overrides + if crawlerRunConfig: + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) + + async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None): + """ + Creates and returns a new browser context with 
configured settings. + Applies text-only mode settings if text_mode is enabled in config. + + Returns: + Context: Browser context object with the specified configurations + """ + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + blocked_extensions = [ + # Images + "jpg", + "jpeg", + "png", + "gif", + "webp", + "svg", + "ico", + "bmp", + "tiff", + "psd", + # Fonts + "woff", + "woff2", + "ttf", + "otf", + "eot", + # Styles + # 'css', 'less', 'scss', 'sass', + # Media + "mp4", + "webm", + "ogg", + "avi", + "mov", + "wmv", + "flv", + "m4v", + "mp3", + "wav", + "aac", + "m4a", + "opus", + "flac", + # Documents + "pdf", + "doc", + "docx", + "xls", + "xlsx", + "ppt", + "pptx", + # Archives + "zip", + "rar", + "7z", + "tar", + "gz", + # Scripts and data + "xml", + "swf", + "wasm", + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "storage_state": self.config.storage_state, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + if crawlerRunConfig: + # Check if there is value for crawlerRunConfig.proxy_config set add that to context + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.get("server"), + } + if crawlerRunConfig.proxy_config.get("username"): + proxy_settings.update({ + "username": crawlerRunConfig.proxy_config.get("username"), + "password": crawlerRunConfig.proxy_config.get("password"), + }) + context_settings["proxy"] = proxy_settings + + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + } + # 
Update context settings with text mode settings + context_settings.update(text_mode_settings) + + # Create and return the context with all settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode settings if enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + return context + + def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: + """ + Converts the crawlerRunConfig into a dict, excludes ephemeral fields, + then returns a hash of the sorted JSON. This yields a stable signature + that identifies configurations requiring a unique browser context. + """ + import json + + config_dict = crawlerRunConfig.__dict__.copy() + # Exclude items that do not affect browser-level setup. + # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config. + ephemeral_keys = [ + "session_id", + "js_code", + "scraping_strategy", + "extraction_strategy", + "chunking_strategy", + "cache_mode", + "content_filter", + "semaphore_count", + "url" + ] + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] + # Convert to canonical JSON string + signature_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON so we get a compact, unique string + signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() + return signature_hash + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig): + """ + Get a page for the given session ID, creating a new one if needed. 
+ + Args: + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + + Returns: + (page, context): The Page and its BrowserContext + """ + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # If using a managed browser, just grab the shared default_context + if self.config.use_managed_browser: + context = self.default_context + page = await context.new_page() + else: + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and setup a new context + context = await self.create_browser_context(crawlerRunConfig) + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def kill_session(self, session_id: str): + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The session ID to kill. 
+ """ + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.config.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def close(self): + """Close all browser resources and clean up.""" + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + # Now close all contexts we created. This reclaims memory from ephemeral contexts. + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception as e: + self.logger.error( + message="Error closing context: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.contexts_by_config.clear() + + if self.browser: + await self.browser.close() + self.browser = None + + if self.managed_browser: + await asyncio.sleep(0.5) + await self.managed_browser.cleanup() + self.managed_browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None diff --git a/pyproject.toml b/pyproject.toml index ea6c5494..f59eabd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,9 @@ dependencies = [ "httpx==0.27.2", "fake-useragent>=2.0.3", "click>=8.1.7", - "pyperclip>=1.8.2" + "pyperclip>=1.8.2", + "cchardet>=2.1.7", + "aiohttp>=3.11.11" ] classifiers = [ "Development Status :: 4 - Beta", diff --git a/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py b/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py new file mode 100644 index 00000000..262cf510 --- /dev/null +++ 
b/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py @@ -0,0 +1,56 @@ +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + HTTPCrawlerConfig, + CacheMode, + DefaultMarkdownGenerator, + PruningContentFilter +) +from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy +from crawl4ai.async_logger import AsyncLogger + +async def main(): + # Initialize HTTP crawler strategy + http_strategy = AsyncHTTPCrawlerStrategy( + browser_config=HTTPCrawlerConfig( + method="GET", + verify_ssl=True, + follow_redirects=True + ), + logger=AsyncLogger(verbose=True) + ) + + # Initialize web crawler with HTTP strategy + async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, + threshold_type="fixed", + min_word_threshold=0 + ) + ) + ) + + # Test different URLs + urls = [ + "https://example.com", + "https://httpbin.org/get", + "raw://Test content" + ] + + for url in urls: + print(f"\n=== Testing {url} ===") + try: + result = await crawler.arun(url=url, config=crawler_config) + print(f"Status: {result.status_code}") + print(f"Raw HTML length: {len(result.html)}") + if hasattr(result, 'markdown_v2'): + print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}") + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/20241401/test_http_crawler_strategy.py b/tests/20241401/test_http_crawler_strategy.py new file mode 100644 index 00000000..dc141418 --- /dev/null +++ b/tests/20241401/test_http_crawler_strategy.py @@ -0,0 +1,116 @@ +from tkinter import N +from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy +from crawl4ai.async_logger import AsyncLogger +from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig +from crawl4ai.async_crawler_strategy 
import asyncio
import inspect
import os
from tempfile import NamedTemporaryFile

from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig
from crawl4ai.async_crawler_strategy import (
    AsyncHTTPCrawlerStrategy,
    ConnectionTimeoutError,
)
from crawl4ai.async_logger import AsyncLogger


def _write_temp_file(payload):
    """Write *payload* (bytes) to a new temporary file and return its path.

    The file is created with ``delete=False`` so it outlives the ``with``
    block and can be reopened by the crawler; the caller must remove it.
    """
    with NamedTemporaryFile(delete=False) as tmp:
        tmp.write(payload)
    return tmp.name


async def _crawl_temp_file(crawler, payload):
    """Crawl a temporary ``file://`` document holding *payload*, then delete it."""
    path = _write_temp_file(payload)
    try:
        return await crawler.crawl(f"file://{path}")
    finally:
        # Always remove the file, even when the crawl raises.
        os.remove(path)


async def _run_scenarios(crawler):
    """Run the ten manual scenarios against *crawler*, printing the outcomes."""
    # Test 1: Basic HTTP GET
    print("\n=== Test 1: Basic HTTP GET ===")
    result = await crawler.crawl("https://example.com")
    print(f"Status: {result.status_code}")
    print(f"Content length: {len(result.html)}")
    print(f"Headers: {dict(result.response_headers)}")

    # Test 2: POST request with JSON
    print("\n=== Test 2: POST with JSON ===")
    crawler.browser_config = crawler.browser_config.clone(
        method="POST",
        json={"test": "data"},
        headers={"Content-Type": "application/json"},
    )
    try:
        result = await crawler.crawl("https://httpbin.org/post")
        print(f"Status: {result.status_code}")
        print(f"Response: {result.html[:200]}...")
    except Exception as e:
        print(f"Error: {e}")

    # Test 3: File handling (reset to a default config first)
    crawler.browser_config = HTTPCrawlerConfig()
    print("\n=== Test 3: Local file handling ===")
    result = await _crawl_temp_file(crawler, b"Test content")
    print(f"File content: {result.html}")

    # Test 4: Raw content
    print("\n=== Test 4: Raw content handling ===")
    raw_html = "raw://Raw test content"
    result = await crawler.crawl(raw_html)
    print(f"Raw content: {result.html}")

    # Test 5: Custom hooks
    print("\n=== Test 5: Custom hooks ===")

    async def before_request(url, kwargs):
        print(f"Before request to {url}")
        kwargs['headers']['X-Custom'] = 'test'

    async def after_request(response):
        print(f"After request, status: {response.status_code}")

    # The hooks are not unset afterwards, so they stay registered for the
    # remaining scenarios.
    crawler.set_hook('before_request', before_request)
    crawler.set_hook('after_request', after_request)
    result = await crawler.crawl("https://example.com")

    # Test 6: Error handling
    print("\n=== Test 6: Error handling ===")
    try:
        await crawler.crawl("https://nonexistent.domain.test")
    except Exception as e:
        print(f"Expected error: {e}")

    # Test 7: Redirects
    print("\n=== Test 7: Redirect handling ===")
    crawler.browser_config = HTTPCrawlerConfig(follow_redirects=True)
    result = await crawler.crawl("http://httpbin.org/redirect/1")
    print(f"Final URL: {result.redirected_url}")

    # Test 8: Custom timeout
    # NOTE(review): the unit of page_timeout is not visible here; the value
    # is presumably meant to be shorter than the 5 s delay -- confirm.
    print("\n=== Test 8: Custom timeout ===")
    try:
        await crawler.crawl(
            "https://httpbin.org/delay/5",
            config=CrawlerRunConfig(page_timeout=2),
        )
    except ConnectionTimeoutError as e:
        print(f"Expected timeout: {e}")

    # Test 9: SSL verification
    print("\n=== Test 9: SSL verification ===")
    crawler.browser_config = HTTPCrawlerConfig(verify_ssl=False)
    try:
        await crawler.crawl("https://expired.badssl.com/")
        print("Connected to invalid SSL site with verification disabled")
    except Exception as e:
        print(f"SSL error: {e}")

    # Test 10: Large file streaming (10 MiB of "X")
    print("\n=== Test 10: Large file streaming ===")
    result = await _crawl_temp_file(crawler, b"X" * (10 * 1024 * 1024))
    print(f"Large file content length: {len(result.html)}")


async def main():
    """Test the AsyncHTTPCrawlerStrategy with various scenarios.

    The strategy is always closed at the end, including when a scenario
    raises an exception that is not handled inside that scenario.
    """
    logger = AsyncLogger(verbose=True)

    # Initialize the strategy with default HTTPCrawlerConfig
    crawler = AsyncHTTPCrawlerStrategy(
        browser_config=HTTPCrawlerConfig(),
        logger=logger,
    )
    try:
        await _run_scenarios(crawler)
    finally:
        # close() must actually run: when it returns an awaitable, a bare
        # call only creates the coroutine and never executes it.
        closing = crawler.close()
        if inspect.isawaitable(closing):
            await closing


if __name__ == "__main__":
    asyncio.run(main())