feat(crawler): add HTTP crawler strategy for lightweight web scraping

Implements a new AsyncHTTPCrawlerStrategy class that provides a fast, memory-efficient alternative to browser-based crawling. Features include: support for HTTP/HTTPS requests with configurable methods, headers, and timeouts; file and raw content handling capabilities; streaming response processing for large files; customizable request/response hooks; and comprehensive error handling. Also refactors the browser management code into a separate module for better organization.
2025-02-15 19:26:30 +08:00
parent 063df572b0
commit 8bb799068e
7 changed files with 1353 additions and 851 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -2,7 +2,7 @@
 import warnings

 from .async_webcrawler import AsyncWebCrawler, CacheMode
-from .async_configs import BrowserConfig, CrawlerRunConfig
+from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
 from .content_scraping_strategy import (
    ContentScrapingStrategy,
    WebScrapingStrategy,
@@ -70,6 +70,7 @@ __all__ = [
    "LXMLWebScrapingStrategy",
    "BrowserConfig",
    "CrawlerRunConfig",
+    "HTTPCrawlerConfig",
    "ExtractionStrategy",
    "LLMExtractionStrategy",
    "CosineStrategy",
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,5 +1,5 @@
-import re
-from attr import has
+from email import header
+from re import I
 from .config import (
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -19,7 +19,6 @@ from typing import Union, List
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy

-
 import inspect
 from typing import Any, Dict, Optional
 from enum import Enum 
@@ -47,8 +46,8 @@ def to_serializable_dict(obj: Any) -> Dict:
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()
        
-    # Handle lists, tuples, and sets
-    if isinstance(obj, (list, tuple, set)):
+    # Handle lists, tuples, sets, and any other iterable object
+    if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__'):
        return [to_serializable_dict(item) for item in obj]
    
    # Handle frozensets, which are not iterable
@@ -67,7 +66,6 @@ def to_serializable_dict(obj: Any) -> Dict:
        # Get constructor signature
        sig = inspect.signature(obj.__class__.__init__)
        params = sig.parameters
-        _type = obj.__class__.__name__
        
        # Get current values
        current_values = {}
@@ -81,24 +79,8 @@ def to_serializable_dict(obj: Any) -> Dict:
            if not (is_empty_value(value) and is_empty_value(param.default)):
                if value != param.default:
                    current_values[name] = to_serializable_dict(value)
-                elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
-                    slot = f"_{name}"
-                    slot_value = getattr(obj, slot, None)
-                    if not is_empty_value(slot_value):
-                        current_values[name] = to_serializable_dict(slot_value)
-
        
-        # # Then handle slots if present
-        # if hasattr(obj.__class__, '__slots__'):
-        #     for slot in obj.__class__.__slots__:
-        #         # Remove leading underscore if present
-        #         param_name = slot[1:] if slot.startswith('_') else slot
-                
-        #         # Get the slot value if it exists
-        #         if hasattr(obj, slot):
-        #             value = getattr(obj, slot)
-        #             if not is_empty_value(value):
-        #                 current_values[param_name] = to_serializable_dict(value)
+        _type = obj.__class__.__name__
        
        return {
            "type": obj.__class__.__name__,
@@ -126,10 +108,7 @@ def from_serializable_dict(data: Any) -> Any:
            
        # Import from crawl4ai for class instances
        import crawl4ai
-        if not hasattr(crawl4ai, data["type"]):
-            return None
-        else:
-            cls = getattr(crawl4ai, data["type"])
+        cls = getattr(crawl4ai, data["type"])
        
        # Handle Enum
        if issubclass(cls, Enum):
@@ -390,16 +369,72 @@ class BrowserConfig():
    def load( data: dict) -> "BrowserConfig":
        # Deserialize the object from a dictionary
        config = from_serializable_dict(data) 
-
-        # check if the deserialized object is an instance of BrowserConfig
        if isinstance(config, BrowserConfig):
            return config
-        elif isinstance(config, dict):
-            return BrowserConfig.from_kwargs(config)
-        else:
-            raise ValueError("Invalid data type for BrowserConfig")
+        return BrowserConfig.from_kwargs(config)


+class HTTPCrawlerConfig():
+    """HTTP-specific crawler configuration"""
+    method: str = "GET"
+    headers: Optional[Dict[str, str]] = None
+    data: Optional[Dict[str, Any]] = None
+    json: Optional[Dict[str, Any]] = None 
+    follow_redirects: bool = True
+    verify_ssl: bool = True
+
+    def __init__(self, method: str = "GET", headers: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, follow_redirects: bool = True, verify_ssl: bool = True):
+        self.method = method
+        self.headers = headers
+        self.data = data
+        self.json = json
+        self.follow_redirects = follow_redirects
+        self.verify_ssl = verify_ssl
+
+    @staticmethod
+    def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
+        return HTTPCrawlerConfig(
+            method=kwargs.get("method", "GET"),
+            headers=kwargs.get("headers"),
+            data=kwargs.get("data"),
+            json=kwargs.get("json"),
+            follow_redirects=kwargs.get("follow_redirects", True),
+            verify_ssl=kwargs.get("verify_ssl", True),
+        )
+
+    def to_dict(self):
+        return {
+            "method": self.method,
+            "headers": self.headers,
+            "data": self.data,
+            "json": self.json,
+            "follow_redirects": self.follow_redirects,
+            "verify_ssl": self.verify_ssl,
+        }
+    
+    def clone(self, **kwargs):
+        """Create a copy of this configuration with updated values.
+        
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+            
+        Returns:
+            HTTPCrawlerConfig: A new instance with the specified updates
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return HTTPCrawlerConfig.from_kwargs(config_dict)
+    
+    def dump(self) -> dict:
+        return to_serializable_dict(self)
+    
+    @staticmethod
+    def load(data: dict) -> "HTTPCrawlerConfig":
+        config = from_serializable_dict(data)
+        if isinstance(config, HTTPCrawlerConfig):
+            return config
+        return HTTPCrawlerConfig.from_kwargs(config)
+
 class CrawlerRunConfig():
    """
    Configuration class for controlling how the crawler runs each crawl operation.
@@ -450,7 +485,7 @@ class CrawlerRunConfig():
        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
                                        If None, defaults to CacheMode.ENABLED internally.
-                                        Default: None.
+                                        Default: CacheMode.BYPASS.
        session_id (str or None): Optional session ID to persist the browser context and the created
                                  page instance. If the ID already exists, the crawler does not
                                  create a new page and uses the current page to preserve the state.
@@ -543,19 +578,27 @@ class CrawlerRunConfig():
        log_console (bool): If True, log console messages from the page.
                            Default: False.

-        # Streaming Parameters
+        # HTTP Crawler Strategy Parameters
+        method (str): HTTP method to use for the request, when using AsyncHTTPCrawlerStrategy.
+                        Default: "GET".
+        data (dict): Data to send in the request body, when using AsyncHTTPCrawlerStrategy.
+                        Default: None.
+        json (dict): JSON data to send in the request body, when using AsyncHTTPCrawlerStrategy.
+                            
+        # Connection Parameters
        stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
                      Default: False.
-
-        # Optional Parameters
-        stream (bool): If True, stream the page content as it is being loaded.
-        url: str = None  # This is not a compulsory parameter
+        
        check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
-        user_agent (str): Custom User-Agent string to use. Default: None
-        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
-                                       user_agent as-is. Default: None.
+                                 Default: False.                                
+        user_agent (str): Custom User-Agent string to use. 
+                          Default: None.
+        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is. 
+                                       Default: None.
        user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
                                                    Default: None.
+        
+        url (str or None): Optional URL; this parameter is not compulsory. Default: None.
    """

    def __init__(
@@ -580,7 +623,7 @@ class CrawlerRunConfig():
        # SSL Parameters
        fetch_ssl_certificate: bool = False,
        # Caching Parameters
-        cache_mode: CacheMode =None,
+        cache_mode: CacheMode = CacheMode.BYPASS,
        session_id: str = None,
        bypass_cache: bool = False,
        disable_cache: bool = False,
@@ -625,7 +668,8 @@ class CrawlerRunConfig():
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
-        # Streaming Parameters
+        # Connection Parameters
+        method: str = "GET",
        stream: bool = False,
        url: str = None,
        check_robots_txt: bool = False,
@@ -713,8 +757,9 @@ class CrawlerRunConfig():
        self.verbose = verbose
        self.log_console = log_console

-        # Streaming Parameters
+        # Connection Parameters
        self.stream = stream
+        self.method = method

        # Robots.txt Handling Parameters
        self.check_robots_txt = check_robots_txt
@@ -769,7 +814,7 @@ class CrawlerRunConfig():
            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
            # Caching Parameters
-            cache_mode=kwargs.get("cache_mode"),
+            cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
            session_id=kwargs.get("session_id"),
            bypass_cache=kwargs.get("bypass_cache", False),
            disable_cache=kwargs.get("disable_cache", False),
@@ -823,15 +868,17 @@ class CrawlerRunConfig():
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
-            # Streaming Parameters
+            # Connection Parameters
+            method=kwargs.get("method", "GET"),
            stream=kwargs.get("stream", False),
-            url=kwargs.get("url"),
            check_robots_txt=kwargs.get("check_robots_txt", False),
            user_agent=kwargs.get("user_agent"),
            user_agent_mode=kwargs.get("user_agent_mode"),
            user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
            # Deep Crawl Parameters
            deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
+
+            url=kwargs.get("url"),
        )

    # Create a funciton returns dict of the object
@@ -843,13 +890,9 @@ class CrawlerRunConfig():
    def load(data: dict) -> "CrawlerRunConfig":
        # Deserialize the object from a dictionary
        config = from_serializable_dict(data) 
-        # If config type is alread instant of CrawleRunConfig, return it
        if isinstance(config, CrawlerRunConfig):
            return config
-        elif isinstance(config, dict):
-            return CrawlerRunConfig.from_kwargs(config)
-        else:
-            raise ValueError("Invalid data type")
+        return CrawlerRunConfig.from_kwargs(config)

    def to_dict(self):
        return {
@@ -910,13 +953,14 @@ class CrawlerRunConfig():
            "exclude_internal_links": self.exclude_internal_links,
            "verbose": self.verbose,
            "log_console": self.log_console,
+            "method": self.method,
            "stream": self.stream,
-            "url": self.url,
            "check_robots_txt": self.check_robots_txt,
            "user_agent": self.user_agent,
            "user_agent_mode": self.user_agent_mode,
            "user_agent_generator_config": self.user_agent_generator_config,
            "deep_crawl_strategy": self.deep_crawl_strategy,
+            "url": self.url,
        }

    def clone(self, **kwargs):
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -0,0 +1,796 @@
+import asyncio
+import time
+from typing import List, Optional
+import os
+import sys
+import shutil
+import tempfile
+import subprocess
+from playwright.async_api import BrowserContext
+import hashlib
+from .js_snippet import load_js_script
+from .config import DOWNLOAD_PAGE_TIMEOUT
+from .async_configs import BrowserConfig, CrawlerRunConfig
+from playwright_stealth import StealthConfig
+from .utils import get_chromium_path
+
+stealth_config = StealthConfig(
+    webdriver=True,
+    chrome_app=True,
+    chrome_csi=True,
+    chrome_load_times=True,
+    chrome_runtime=True,
+    navigator_languages=True,
+    navigator_plugins=True,
+    navigator_permissions=True,
+    webgl_vendor=True,
+    outerdimensions=True,
+    navigator_hardware_concurrency=True,
+    media_codecs=True,
+)
+
+BROWSER_DISABLE_OPTIONS = [
+    "--disable-background-networking",
+    "--disable-background-timer-throttling",
+    "--disable-backgrounding-occluded-windows",
+    "--disable-breakpad",
+    "--disable-client-side-phishing-detection",
+    "--disable-component-extensions-with-background-pages",
+    "--disable-default-apps",
+    "--disable-extensions",
+    "--disable-features=TranslateUI",
+    "--disable-hang-monitor",
+    "--disable-ipc-flooding-protection",
+    "--disable-popup-blocking",
+    "--disable-prompt-on-repost",
+    "--disable-sync",
+    "--force-color-profile=srgb",
+    "--metrics-recording-only",
+    "--no-first-run",
+    "--password-store=basic",
+    "--use-mock-keychain",
+]
+
+
+class ManagedBrowser:
+    """
+    Manages the browser process and context. This class allows to connect to the browser using CDP protocol.
+
+    Attributes:
+        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
+                            Default: "chromium".
+        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
+                                     temporary directory may be used. Default: None.
+        headless (bool): Whether to run the browser in headless mode (no visible GUI).
+                         Default: False.
+        browser_process (subprocess.Popen): The process object for the browser.
+        temp_dir (str): Temporary directory for user data if not provided.
+        debugging_port (int): Port for debugging the browser.
+        host (str): Host for debugging the browser.
+
+        Methods:
+            start(): Starts the browser process and returns the CDP endpoint URL.
+            _get_browser_path(): Returns the browser executable path based on OS and browser type.
+            _get_browser_args(): Returns browser-specific command line arguments.
+            _get_user_data_dir(): Returns the user data directory path.
+            cleanup(): Terminates the browser process and removes the temporary directory.
+    """
+
+    browser_type: str
+    user_data_dir: str
+    headless: bool
+    browser_process: subprocess.Popen
+    temp_dir: str
+    debugging_port: int
+    host: str
+
+    def __init__(
+        self,
+        browser_type: str = "chromium",
+        user_data_dir: Optional[str] = None,
+        headless: bool = False,
+        logger=None,
+        host: str = "localhost",
+        debugging_port: int = 9222,
+        cdp_url: Optional[str] = None, 
+    ):
+        """
+        Initialize the ManagedBrowser instance.
+
+        Args:
+            browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
+                                Default: "chromium".
+            user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
+                                         temporary directory may be used. Default: None.
+            headless (bool): Whether to run the browser in headless mode (no visible GUI).
+                             Default: False.
+            logger (logging.Logger): Logger instance for logging messages. Default: None.
+            host (str): Host for debugging the browser. Default: "localhost".
+            debugging_port (int): Port for debugging the browser. Default: 9222.
+            cdp_url (str or None): CDP URL to connect to the browser. Default: None.
+        """
+        self.browser_type = browser_type
+        self.user_data_dir = user_data_dir
+        self.headless = headless
+        self.browser_process = None
+        self.temp_dir = None
+        self.debugging_port = debugging_port
+        self.host = host
+        self.logger = logger
+        self.shutting_down = False
+        self.cdp_url = cdp_url
+
+    async def start(self) -> str:
+        """
+        Starts the browser process or returns CDP endpoint URL.
+        If cdp_url is provided, returns it directly.
+        If user_data_dir is not provided for local browser, creates a temporary directory.
+        
+        Returns:
+            str: CDP endpoint URL
+        """
+        # If CDP URL provided, just return it
+        if self.cdp_url:
+            return self.cdp_url
+
+        # Create temp dir if needed
+        if not self.user_data_dir:
+            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
+            self.user_data_dir = self.temp_dir
+
+        # Get browser path and args based on OS and browser type
+        # browser_path = self._get_browser_path()
+        args = await self._get_browser_args()
+
+        # Start browser process
+        try:
+            self.browser_process = subprocess.Popen(
+                args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            # Monitor browser process output for errors
+            asyncio.create_task(self._monitor_browser_process())
+            await asyncio.sleep(2)  # Give browser time to start
+            return f"http://{self.host}:{self.debugging_port}"
+        except Exception as e:
+            await self.cleanup()
+            raise Exception(f"Failed to start browser: {e}")
+
+    async def _monitor_browser_process(self):
+        """
+        Monitor the browser process for unexpected termination.
+
+        How it works:
+        1. Read stdout and stderr from the browser process.
+        2. If the process has terminated, log the error message and terminate the browser.
+        3. If the shutting_down flag is set, log the normal termination message.
+        4. If any other error occurs, log the error message.
+
+        Note: This method should be called in a separate task to avoid blocking the main event loop.
+        """
+        if self.browser_process:
+            try:
+                stdout, stderr = await asyncio.gather(
+                    asyncio.to_thread(self.browser_process.stdout.read),
+                    asyncio.to_thread(self.browser_process.stderr.read),
+                )
+
+                # Check shutting_down flag BEFORE logging anything
+                if self.browser_process.poll() is not None:
+                    if not self.shutting_down:
+                        self.logger.error(
+                            message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
+                            tag="ERROR",
+                            params={
+                                "code": self.browser_process.returncode,
+                                "stdout": stdout.decode(),
+                                "stderr": stderr.decode(),
+                            },
+                        )
+                        await self.cleanup()
+                    else:
+                        self.logger.info(
+                            message="Browser process terminated normally | Code: {code}",
+                            tag="INFO",
+                            params={"code": self.browser_process.returncode},
+                        )
+            except Exception as e:
+                if not self.shutting_down:
+                    self.logger.error(
+                        message="Error monitoring browser process: {error}",
+                        tag="ERROR",
+                        params={"error": str(e)},
+                    )
+
+    def _get_browser_path_WIP(self) -> str:
+        """Returns the browser executable path based on OS and browser type"""
+        if sys.platform == "darwin":  # macOS
+            paths = {
+                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
+                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
+            }
+        elif sys.platform == "win32":  # Windows
+            paths = {
+                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
+                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
+                "webkit": None,  # WebKit not supported on Windows
+            }
+        else:  # Linux
+            paths = {
+                "chromium": "google-chrome",
+                "firefox": "firefox",
+                "webkit": None,  # WebKit not supported on Linux
+            }
+
+        return paths.get(self.browser_type)
+
+    async def _get_browser_path(self) -> str:
+        browser_path = await get_chromium_path(self.browser_type)
+        return browser_path
+
+    async def _get_browser_args(self) -> List[str]:
+        """Returns browser-specific command line arguments"""
+        base_args = [await self._get_browser_path()]
+
+        if self.browser_type == "chromium":
+            args = [
+                f"--remote-debugging-port={self.debugging_port}",
+                f"--user-data-dir={self.user_data_dir}",
+            ]
+            if self.headless:
+                args.append("--headless=new")
+        elif self.browser_type == "firefox":
+            args = [
+                "--remote-debugging-port",
+                str(self.debugging_port),
+                "--profile",
+                self.user_data_dir,
+            ]
+            if self.headless:
+                args.append("--headless")
+        else:
+            raise NotImplementedError(f"Browser type {self.browser_type} not supported")
+
+        return base_args + args
+
+    async def cleanup(self):
+        """Cleanup browser process and temporary directory"""
+        # Set shutting_down flag BEFORE any termination actions
+        self.shutting_down = True
+
+        if self.browser_process:
+            try:
+                self.browser_process.terminate()
+                # Wait for process to end gracefully
+                for _ in range(10):  # 10 attempts, 100ms each
+                    if self.browser_process.poll() is not None:
+                        break
+                    await asyncio.sleep(0.1)
+
+                # Force kill if still running
+                if self.browser_process.poll() is None:
+                    self.browser_process.kill()
+                    await asyncio.sleep(0.1)  # Brief wait for kill to take effect
+
+            except Exception as e:
+                self.logger.error(
+                    message="Error terminating browser: {error}",
+                    tag="ERROR",
+                    params={"error": str(e)},
+                )
+
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            try:
+                shutil.rmtree(self.temp_dir)
+            except Exception as e:
+                self.logger.error(
+                    message="Error removing temporary directory: {error}",
+                    tag="ERROR",
+                    params={"error": str(e)},
+                )
+
+
+class BrowserManager:
+    """
+    Manages the browser instance and context.
+
+    Attributes:
+        config (BrowserConfig): Configuration object containing all browser settings
+        logger: Logger instance for recording events and errors
+        browser (Browser): The browser instance
+        default_context (BrowserContext): The default browser context
+        managed_browser (ManagedBrowser): The managed browser instance
+        playwright (Playwright): The Playwright instance
+        sessions (dict): Dictionary to store session information
+        session_ttl (int): Session timeout in seconds
+    """
+
+    def __init__(self, browser_config: BrowserConfig, logger=None):
+        """
+        Initialize the BrowserManager with a browser configuration.
+
+        Args:
+            browser_config (BrowserConfig): Configuration object containing all browser settings
+            logger: Logger instance for recording events and errors
+        """
+        self.config: BrowserConfig = browser_config
+        self.logger = logger
+
+        # Browser state
+        self.browser = None
+        self.default_context = None
+        self.managed_browser = None
+        self.playwright = None
+
+        # Session management
+        self.sessions = {}
+        self.session_ttl = 1800  # 30 minutes
+
+        # Keep track of contexts by a "config signature," so each unique config reuses a single context
+        self.contexts_by_config = {}
+        self._contexts_lock = asyncio.Lock() 
+
+        # Initialize ManagedBrowser if needed
+        if self.config.use_managed_browser:
+            self.managed_browser = ManagedBrowser(
+                browser_type=self.config.browser_type,
+                user_data_dir=self.config.user_data_dir,
+                headless=self.config.headless,
+                logger=self.logger,
+                debugging_port=self.config.debugging_port,
+            )
+
+    async def start(self):
+        """
+        Start the browser instance and set up the default context.
+
+        How it works:
+        1. Check if Playwright is already initialized.
+        2. If not, initialize Playwright.
+        3. If managed browser is used, start it and connect to the CDP endpoint.
+        4. If managed browser is not used, launch the browser and set up the default context.
+
+        Note: This method should be called in a separate task to avoid blocking the main event loop.
+        """
+        if self.playwright is None:
+            from playwright.async_api import async_playwright
+
+            self.playwright = await async_playwright().start()
+
+        if self.config.use_managed_browser:
+            cdp_url = await self.managed_browser.start()
+            self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
+            contexts = self.browser.contexts
+            if contexts:
+                self.default_context = contexts[0]
+            else:
+                self.default_context = await self.create_browser_context()
+                # self.default_context = await self.browser.new_context(
+                #     viewport={
+                #         "width": self.config.viewport_width,
+                #         "height": self.config.viewport_height,
+                #     },
+                #     storage_state=self.config.storage_state,
+                #     user_agent=self.config.headers.get(
+                #         "User-Agent", self.config.user_agent
+                #     ),
+                #     accept_downloads=self.config.accept_downloads,
+                #     ignore_https_errors=self.config.ignore_https_errors,
+                #     java_script_enabled=self.config.java_script_enabled,
+                # )
+            await self.setup_context(self.default_context)
+        else:
+            browser_args = self._build_browser_args()
+
+            # Launch appropriate browser type
+            if self.config.browser_type == "firefox":
+                self.browser = await self.playwright.firefox.launch(**browser_args)
+            elif self.config.browser_type == "webkit":
+                self.browser = await self.playwright.webkit.launch(**browser_args)
+            else:
+                self.browser = await self.playwright.chromium.launch(**browser_args)
+
+            self.default_context = self.browser
+
+    def _build_browser_args(self) -> dict:
+        """Build browser launch arguments from config."""
+        args = [
+            "--disable-gpu",
+            "--disable-gpu-compositing",
+            "--disable-software-rasterizer",
+            "--no-sandbox",
+            "--disable-dev-shm-usage",
+            "--no-first-run",
+            "--no-default-browser-check",
+            "--disable-infobars",
+            "--window-position=0,0",
+            "--ignore-certificate-errors",
+            "--ignore-certificate-errors-spki-list",
+            "--disable-blink-features=AutomationControlled",
+            "--window-position=400,0",
+            "--disable-renderer-backgrounding",
+            "--disable-ipc-flooding-protection",
+            "--force-color-profile=srgb",
+            "--mute-audio",
+            "--disable-background-timer-throttling",
+            # "--single-process",
+            f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
+        ]
+
+        if self.config.light_mode:
+            args.extend(BROWSER_DISABLE_OPTIONS)
+
+        if self.config.text_mode:
+            args.extend(
+                [
+                    "--blink-settings=imagesEnabled=false",
+                    "--disable-remote-fonts",
+                    "--disable-images",
+                    "--disable-javascript",
+                    "--disable-software-rasterizer",
+                    "--disable-dev-shm-usage",
+                ]
+            )
+
+        if self.config.extra_args:
+            args.extend(self.config.extra_args)
+
+        browser_args = {"headless": self.config.headless, "args": args}
+
+        if self.config.chrome_channel:
+            browser_args["channel"] = self.config.chrome_channel
+
+        if self.config.accept_downloads:
+            browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
+                os.getcwd(), "downloads"
+            )
+            os.makedirs(browser_args["downloads_path"], exist_ok=True)
+
+        if self.config.proxy or self.config.proxy_config:
+            from playwright.async_api import ProxySettings
+
+            proxy_settings = (
+                ProxySettings(server=self.config.proxy)
+                if self.config.proxy
+                else ProxySettings(
+                    server=self.config.proxy_config.get("server"),
+                    username=self.config.proxy_config.get("username"),
+                    password=self.config.proxy_config.get("password"),
+                )
+            )
+            browser_args["proxy"] = proxy_settings
+
+        return browser_args
+
+    async def setup_context(
+        self,
+        context: BrowserContext,
+        crawlerRunConfig: CrawlerRunConfig = None,
+        is_default=False,
+    ):
+        """
+        Set up a browser context with the configured options.
+
+        How it works:
+        1. Set extra HTTP headers if provided.
+        2. Add cookies if provided.
+        3. Query the context's storage state if one is configured.
+        4. If downloads are accepted, raise the default timeouts and record
+           the downloads path on the context.
+        5. If a user agent is configured, re-send the extra headers with
+           User-Agent and sec-ch-ua added.
+        6. Add the default "cookiesEnabled" cookie.
+        7. Register the navigator override script when the run config asks for it.
+
+        Args:
+            context (BrowserContext): The browser context to set up
+            crawlerRunConfig (CrawlerRunConfig): Per-run configuration; may be None
+            is_default (bool): Flag indicating if this is the default context
+                (not read by this method)
+        Returns:
+            None
+        """
+        # Headers are optional; normalise once so the merge below never sees None.
+        headers = self.config.headers or {}
+        if headers:
+            await context.set_extra_http_headers(headers)
+
+        if self.config.cookies:
+            await context.add_cookies(self.config.cookies)
+
+        if self.config.storage_state:
+            # NOTE(review): storage_state() only reads the current state and the
+            # result is discarded here. The configured state is handed to
+            # new_context() in create_browser_context - confirm this call is needed.
+            await context.storage_state(path=None)
+
+        if self.config.accept_downloads:
+            context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
+            context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
+            if self.config.downloads_path:
+                # Writes to Playwright private attributes; may break on upgrade.
+                context_options = context._impl_obj._options
+                context_options["accept_downloads"] = True
+                context_options["downloads_path"] = self.config.downloads_path
+
+        # Handle user agent and browser hints
+        if self.config.user_agent:
+            # Entries from config.headers are merged last, so they override the
+            # derived User-Agent / sec-ch-ua values.
+            combined_headers = {
+                "User-Agent": self.config.user_agent,
+                "sec-ch-ua": self.config.browser_hint,
+                **headers,
+            }
+            await context.set_extra_http_headers(combined_headers)
+
+        # Add default cookie. Fall back to the project URL when there is no run
+        # config or it carries no URL, so the cookie always has a URL.
+        cookie_url = (
+            crawlerRunConfig.url if crawlerRunConfig else None
+        ) or "https://crawl4ai.com/"
+        await context.add_cookies(
+            [
+                {
+                    "name": "cookiesEnabled",
+                    "value": "true",
+                    "url": cookie_url,
+                }
+            ]
+        )
+
+        # Handle navigator overrides
+        if crawlerRunConfig and (
+            crawlerRunConfig.override_navigator
+            or crawlerRunConfig.simulate_user
+            or crawlerRunConfig.magic
+        ):
+            await context.add_init_script(load_js_script("navigator_overrider"))
+
+    async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
+        """
+        Creates and returns a new browser context with configured settings.
+        Applies text-only mode settings if text_mode is enabled in config.
+
+        Args:
+            crawlerRunConfig (CrawlerRunConfig): Optional per-run configuration. When it
+                carries a proxy_config, that proxy replaces the browser-level proxy
+                for this context.
+
+        Returns:
+            Context: Browser context object with the specified configurations
+        """
+        # Base settings. A "User-Agent" entry in the configured headers takes
+        # precedence over config.user_agent; headers themselves are optional.
+        configured_headers = self.config.headers or {}
+        user_agent = configured_headers.get("User-Agent", self.config.user_agent)
+        viewport_settings = {
+            "width": self.config.viewport_width,
+            "height": self.config.viewport_height,
+        }
+        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
+
+        # Resource types aborted in text mode, matched by URL file extension.
+        # Styles (css, less, scss, sass) are not blocked.
+        blocked_extensions = [
+            # Images
+            "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
+            # Fonts
+            "woff", "woff2", "ttf", "otf", "eot",
+            # Media
+            "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v",
+            "mp3", "wav", "aac", "m4a", "opus", "flac",
+            # Documents
+            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
+            # Archives
+            "zip", "rar", "7z", "tar", "gz",
+            # Scripts and data
+            "xml", "swf", "wasm",
+        ]
+
+        # Common context settings
+        context_settings = {
+            "user_agent": user_agent,
+            "viewport": viewport_settings,
+            "proxy": proxy_settings,
+            "accept_downloads": self.config.accept_downloads,
+            "storage_state": self.config.storage_state,
+            "ignore_https_errors": self.config.ignore_https_errors,
+            "device_scale_factor": 1.0,
+            "java_script_enabled": self.config.java_script_enabled,
+        }
+
+        # A per-run proxy replaces the browser-level proxy for this context.
+        if crawlerRunConfig and crawlerRunConfig.proxy_config:
+            run_proxy = crawlerRunConfig.proxy_config
+            proxy_settings = {"server": run_proxy.get("server")}
+            # Credentials are only attached when a username is present.
+            if run_proxy.get("username"):
+                proxy_settings.update({
+                    "username": run_proxy.get("username"),
+                    "password": run_proxy.get("password"),
+                })
+            context_settings["proxy"] = proxy_settings
+
+        if self.config.text_mode:
+            # Update context settings with text mode settings
+            context_settings.update({"has_touch": False, "is_mobile": False})
+
+        # Create the context with all settings
+        context = await self.browser.new_context(**context_settings)
+
+        # Apply text mode settings if enabled
+        if self.config.text_mode:
+            # The handler does not depend on the extension, so build it once
+            # instead of once per loop iteration.
+            def abort_request(route):
+                return route.abort()
+
+            for ext in blocked_extensions:
+                await context.route(f"**/*.{ext}", abort_request)
+        return context
+
+    def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
+        """
+        Build a signature for the browser-relevant part of a run config.
+
+        The config's attributes are copied into a dict, the fields that do not
+        affect browser-level setup are dropped, and the remainder is dumped as
+        key-sorted JSON and hashed with SHA-256. Two configs with the same
+        signature can share one browser context.
+
+        Values JSON cannot encode are passed through str(), so the signature is
+        only as stable as the str() of those values.
+        """
+        import json
+
+        # Fields that do not affect browser-level setup. Expand or adjust as
+        # needed, e.g. chunking_strategy is purely for data extraction.
+        ignored_fields = (
+            "session_id",
+            "js_code",
+            "scraping_strategy",
+            "extraction_strategy",
+            "chunking_strategy",
+            "cache_mode",
+            "content_filter",
+            "semaphore_count",
+            "url",
+        )
+        relevant = {
+            name: value
+            for name, value in crawlerRunConfig.__dict__.items()
+            if name not in ignored_fields
+        }
+
+        # Canonical JSON (sorted keys), then hash it into a compact string.
+        canonical = json.dumps(relevant, sort_keys=True, default=str)
+        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+
+    async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
+        """
+        Return a page for this run, reusing a live session when one matches.
+
+        Expired sessions are scheduled for cleanup first. A known session_id
+        returns its stored page and context with a refreshed timestamp.
+        Otherwise a new page is opened, either on the shared default context
+        (managed browser) or on the context cached for this config's signature,
+        which is created and set up on first use.
+
+        Args:
+            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
+
+        Returns:
+            (page, context): The Page and its BrowserContext
+        """
+        self._cleanup_expired_sessions()
+
+        sid = crawlerRunConfig.session_id
+
+        # Known session: hand back its page + context and refresh the timestamp.
+        if sid and sid in self.sessions:
+            context, page, _ = self.sessions[sid]
+            self.sessions[sid] = (context, page, time.time())
+            return page, context
+
+        if self.config.use_managed_browser:
+            # A managed browser shares a single default context.
+            context = self.default_context
+        else:
+            signature = self._make_config_signature(crawlerRunConfig)
+
+            # The lock keeps concurrent callers from creating two contexts
+            # for the same signature.
+            async with self._contexts_lock:
+                if signature not in self.contexts_by_config:
+                    fresh_context = await self.create_browser_context(crawlerRunConfig)
+                    await self.setup_context(fresh_context, crawlerRunConfig)
+                    self.contexts_by_config[signature] = fresh_context
+                context = self.contexts_by_config[signature]
+
+        # Open a new page on the chosen context.
+        page = await context.new_page()
+
+        # Remember the session so later calls with the same id reuse this page.
+        if sid:
+            self.sessions[sid] = (context, page, time.time())
+
+        return page, context
+
+    async def kill_session(self, session_id: str):
+        """
+        Kill a browser session and clean up resources.
+
+        Unknown session ids are ignored. The session is removed from the
+        registry before anything is awaited, so overlapping kills of the same
+        id (for example two scheduled cleanup tasks) cannot both act on it.
+
+        Args:
+            session_id (str): The session ID to kill.
+        """
+        session = self.sessions.pop(session_id, None)
+        if session is None:
+            return
+        context, page, _ = session
+
+        owns_context = not self.config.use_managed_browser
+        if owns_context:
+            # Drop the context from the per-config cache before it is closed;
+            # otherwise get_page() would later hand out a closed context.
+            stale_signatures = [
+                signature
+                for signature, cached in self.contexts_by_config.items()
+                if cached is context
+            ]
+            for signature in stale_signatures:
+                del self.contexts_by_config[signature]
+
+        await page.close()
+        if owns_context:
+            # NOTE(review): contexts are shared per config signature, so this
+            # also closes any other pages opened on the same context.
+            await context.close()
+
+    def _cleanup_expired_sessions(self):
+        """Schedule kill_session() for every session idle longer than session_ttl."""
+        now = time.time()
+        # Collect the ids first, then schedule; the kills run later as tasks.
+        stale_ids = []
+        for sid, (_context, _page, last_used) in self.sessions.items():
+            if now - last_used > self.session_ttl:
+                stale_ids.append(sid)
+        for sid in stale_ids:
+            asyncio.create_task(self.kill_session(sid))
+
+    async def close(self):
+        """
+        Close all browser resources and clean up.
+
+        Order: optional short sleep, sessions, cached contexts, browser,
+        managed browser, Playwright.
+        """
+        if self.config.sleep_on_close:
+            await asyncio.sleep(0.5)
+
+        # Snapshot the ids: kill_session() removes entries while we iterate.
+        for sid in list(self.sessions.keys()):
+            await self.kill_session(sid)
+
+        # Close every context cached per config signature. A failure on one
+        # is logged and does not stop the others from being closed.
+        for cached_context in self.contexts_by_config.values():
+            try:
+                await cached_context.close()
+            except Exception as exc:
+                self.logger.error(
+                    message="Error closing context: {error}",
+                    tag="ERROR",
+                    params={"error": str(exc)},
+                )
+        self.contexts_by_config.clear()
+
+        if self.browser:
+            await self.browser.close()
+            self.browser = None
+
+        if self.managed_browser:
+            # Short pause before the managed browser is cleaned up.
+            await asyncio.sleep(0.5)
+            await self.managed_browser.cleanup()
+            self.managed_browser = None
+
+        if self.playwright:
+            await self.playwright.stop()
+            self.playwright = None
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,9 @@ dependencies = [
    "httpx==0.27.2",
    "fake-useragent>=2.0.3",
    "click>=8.1.7",
-    "pyperclip>=1.8.2"
+    "pyperclip>=1.8.2",
+    "cchardet>=2.1.7",
+    "aiohttp>=3.11.11"
 ]
 classifiers = [
    "Development Status :: 4 - Beta",
--- a/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py
+++ b/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py
@@ -0,0 +1,56 @@
+import asyncio
+from crawl4ai import (
+    AsyncWebCrawler,
+    CrawlerRunConfig,
+    HTTPCrawlerConfig,
+    CacheMode,
+    DefaultMarkdownGenerator,
+    PruningContentFilter
+)
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+from crawl4ai.async_logger import AsyncLogger
+
+async def main():
+    """Crawl a few sample URLs through AsyncWebCrawler backed by the HTTP strategy."""
+    # Initialize HTTP crawler strategy
+    http_strategy = AsyncHTTPCrawlerStrategy(
+        browser_config=HTTPCrawlerConfig(
+            method="GET", verify_ssl=True, follow_redirects=True
+        ),
+        logger=AsyncLogger(verbose=True),
+    )
+
+    # Initialize web crawler with HTTP strategy
+    async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
+        pruning_filter = PruningContentFilter(
+            threshold=0.48, threshold_type="fixed", min_word_threshold=0
+        )
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(content_filter=pruning_filter),
+        )
+
+        # One remote page, one JSON endpoint, one inline raw document
+        targets = (
+            "https://example.com",
+            "https://httpbin.org/get",
+            "raw://<html><body>Test content</body></html>",
+        )
+
+        for target in targets:
+            print(f"\n=== Testing {target} ===")
+            try:
+                outcome = await crawler.arun(url=target, config=run_config)
+                print(f"Status: {outcome.status_code}")
+                print(f"Raw HTML length: {len(outcome.html)}")
+                if hasattr(outcome, 'markdown_v2'):
+                    print(f"Markdown length: {len(outcome.markdown_v2.raw_markdown)}")
+            except Exception as exc:
+                # Keep going so one failing URL does not hide the others.
+                print(f"Error: {exc}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/20241401/test_http_crawler_strategy.py
+++ b/tests/20241401/test_http_crawler_strategy.py
@@ -0,0 +1,116 @@
+from tkinter import N
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+from crawl4ai.async_logger import AsyncLogger
+from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig
+from crawl4ai.async_crawler_strategy import ConnectionTimeoutError
+import asyncio
+import os
+
+async def main():
+    """Test the AsyncHTTPCrawlerStrategy with various scenarios"""
+    import inspect
+    from tempfile import NamedTemporaryFile
+
+    def write_temp_html(payload: bytes) -> str:
+        """Write payload to a new temp file and return its path; the caller deletes it."""
+        with NamedTemporaryFile(delete=False) as f:
+            f.write(payload)
+        return f.name
+
+    logger = AsyncLogger(verbose=True)
+
+    # Initialize the strategy with default HTTPCrawlerConfig
+    crawler = AsyncHTTPCrawlerStrategy(
+        browser_config=HTTPCrawlerConfig(),
+        logger=logger
+    )
+    try:
+        # Test 1: Basic HTTP GET
+        print("\n=== Test 1: Basic HTTP GET ===")
+        result = await crawler.crawl("https://example.com")
+        print(f"Status: {result.status_code}")
+        print(f"Content length: {len(result.html)}")
+        print(f"Headers: {dict(result.response_headers)}")
+
+        # Test 2: POST request with JSON
+        print("\n=== Test 2: POST with JSON ===")
+        crawler.browser_config = crawler.browser_config.clone(
+            method="POST",
+            json={"test": "data"},
+            headers={"Content-Type": "application/json"}
+        )
+        try:
+            result = await crawler.crawl(
+                "https://httpbin.org/post",
+            )
+            print(f"Status: {result.status_code}")
+            print(f"Response: {result.html[:200]}...")
+        except Exception as e:
+            print(f"Error: {e}")
+
+        # Test 3: File handling
+        crawler.browser_config = HTTPCrawlerConfig()
+        print("\n=== Test 3: Local file handling ===")
+        # The file is closed before it is crawled and removed afterwards,
+        # even when the crawl fails.
+        small_path = write_temp_html(b"<html><body>Test content</body></html>")
+        try:
+            result = await crawler.crawl(f"file://{small_path}")
+            print(f"File content: {result.html}")
+        finally:
+            os.remove(small_path)
+
+        # Test 4: Raw content
+        print("\n=== Test 4: Raw content handling ===")
+        raw_html = "raw://<html><body>Raw test content</body></html>"
+        result = await crawler.crawl(raw_html)
+        print(f"Raw content: {result.html}")
+
+        # Test 5: Custom hooks
+        print("\n=== Test 5: Custom hooks ===")
+        async def before_request(url, kwargs):
+            print(f"Before request to {url}")
+            kwargs['headers']['X-Custom'] = 'test'
+
+        async def after_request(response):
+            print(f"After request, status: {response.status_code}")
+
+        crawler.set_hook('before_request', before_request)
+        crawler.set_hook('after_request', after_request)
+        result = await crawler.crawl("https://example.com")
+
+        # Test 6: Error handling
+        print("\n=== Test 6: Error handling ===")
+        try:
+            await crawler.crawl("https://nonexistent.domain.test")
+        except Exception as e:
+            print(f"Expected error: {e}")
+
+        # Test 7: Redirects
+        print("\n=== Test 7: Redirect handling ===")
+        crawler.browser_config = HTTPCrawlerConfig(follow_redirects=True)
+        result = await crawler.crawl("http://httpbin.org/redirect/1")
+        print(f"Final URL: {result.redirected_url}")
+
+        # Test 8: Custom timeout
+        print("\n=== Test 8: Custom timeout ===")
+        try:
+            await crawler.crawl(
+                "https://httpbin.org/delay/5",
+                config=CrawlerRunConfig(page_timeout=2)
+            )
+        except ConnectionTimeoutError as e:
+            print(f"Expected timeout: {e}")
+
+        # Test 9: SSL verification
+        print("\n=== Test 9: SSL verification ===")
+        crawler.browser_config = HTTPCrawlerConfig(verify_ssl=False)
+        try:
+            await crawler.crawl("https://expired.badssl.com/")
+            print("Connected to invalid SSL site with verification disabled")
+        except Exception as e:
+            print(f"SSL error: {e}")
+
+        # Test 10: Large file streaming
+        print("\n=== Test 10: Large file streaming ===")
+        large_path = write_temp_html(
+            b"<html><body>" + b"X" * 1024 * 1024 * 10 + b"</body></html>"
+        )
+        try:
+            result = await crawler.crawl("file://" + large_path)
+            print(f"Large file content length: {len(result.html)}")
+        finally:
+            # Remove the 10 MB file even when the crawl raises.
+            os.remove(large_path)
+    finally:
+        # Release the strategy's resources even if a test above raised.
+        # close() may be a coroutine function; await its result when it is.
+        closing = crawler.close()
+        if inspect.isawaitable(closing):
+            await closing
+
+if __name__ == "__main__":
+    asyncio.run(main())