feat(crawler): add HTTP crawler strategy for lightweight web scraping

Implements a new AsyncHTTPCrawlerStrategy class that provides a fast, memory-efficient alternative to browser-based crawling. Features include:
- Support for HTTP/HTTPS requests with configurable methods, headers, and timeouts
- File and raw content handling capabilities
- Streaming response processing for large files
- Customizable request/response hooks
- Comprehensive error handling

Also refactors browser management code into a separate module for better organization.
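
For orientation, a minimal usage sketch of how the new strategy might be wired into an existing crawler; the async_crawler_strategy import path and the browser_config/crawler_strategy keyword names are assumptions inferred from this diff, not confirmed API:

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy  # assumed module path

async def main():
    # HTTP-level options come from the new HTTPCrawlerConfig defined in this commit
    http_config = HTTPCrawlerConfig(method="GET", follow_redirects=True, verify_ssl=True)
    # Assumed wiring: pass the lightweight strategy in place of the browser-based default
    strategy = AsyncHTTPCrawlerStrategy(browser_config=http_config)
    async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
        result = await crawler.arun("https://example.com", config=CrawlerRunConfig())
        print(result.status_code)

asyncio.run(main())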
UncleCode
2025-02-15 19:26:30 +08:00
parent 063df572b0
commit 8bb799068e
7 changed files with 1353 additions and 851 deletions


@@ -1,5 +1,5 @@
import re
from attr import has
from email import header
from re import I
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -19,7 +19,6 @@ from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
import inspect
from typing import Any, Dict, Optional
from enum import Enum
@@ -47,8 +46,8 @@ def to_serializable_dict(obj: Any) -> Dict:
if hasattr(obj, 'isoformat'):
return obj.isoformat()
# Handle lists, tuples, and sets
if isinstance(obj, (list, tuple, set)):
# Handle lists, tuples, sets, and any other iterable
if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__'):
return [to_serializable_dict(item) for item in obj]
# Handle frozensets
@@ -67,7 +66,6 @@ def to_serializable_dict(obj: Any) -> Dict:
# Get constructor signature
sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters
_type = obj.__class__.__name__
# Get current values
current_values = {}
@@ -81,24 +79,8 @@ def to_serializable_dict(obj: Any) -> Dict:
if not (is_empty_value(value) and is_empty_value(param.default)):
if value != param.default:
current_values[name] = to_serializable_dict(value)
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
slot = f"_{name}"
slot_value = getattr(obj, slot, None)
if not is_empty_value(slot_value):
current_values[name] = to_serializable_dict(slot_value)
# # Then handle slots if present
# if hasattr(obj.__class__, '__slots__'):
# for slot in obj.__class__.__slots__:
# # Remove leading underscore if present
# param_name = slot[1:] if slot.startswith('_') else slot
# # Get the slot value if it exists
# if hasattr(obj, slot):
# value = getattr(obj, slot)
# if not is_empty_value(value):
# current_values[param_name] = to_serializable_dict(value)
_type = obj.__class__.__name__
return {
"type": obj.__class__.__name__,
@@ -126,10 +108,7 @@ def from_serializable_dict(data: Any) -> Any:
# Import from crawl4ai for class instances
import crawl4ai
if not hasattr(crawl4ai, data["type"]):
return None
else:
cls = getattr(crawl4ai, data["type"])
cls = getattr(crawl4ai, data["type"])
# Handle Enum
if issubclass(cls, Enum):
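
For reference, the serializer above emits a plain dict envelope keyed by class name, and from_serializable_dict resolves that name against the crawl4ai module (with Enums rebuilt from their values). A sketch of the envelope; the "params" key is hypothetical, since only "type" is visible in this hunk:

# Hypothetical envelope shape; only the "type" key is confirmed by the hunk above
serialized = {
    "type": "BrowserConfig",        # resolved via getattr(crawl4ai, data["type"])
    "params": {"headless": False},  # assumed: non-default constructor args, serialized recursively
}
config = from_serializable_dict(serialized)  # rebuilds a BrowserConfig instance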
@@ -390,16 +369,72 @@ class BrowserConfig():
def load( data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
# check if the deserialized object is an instance of BrowserConfig
if isinstance(config, BrowserConfig):
return config
elif isinstance(config, dict):
return BrowserConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type for BrowserConfig")
return BrowserConfig.from_kwargs(config)
class HTTPCrawlerConfig():
"""HTTP-specific crawler configuration"""
method: str = "GET"
headers: Optional[Dict[str, str]] = None
data: Optional[Dict[str, Any]] = None
json: Optional[Dict[str, Any]] = None
follow_redirects: bool = True
verify_ssl: bool = True
def __init__(self, method: str = "GET", headers: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, follow_redirects: bool = True, verify_ssl: bool = True):
self.method = method
self.headers = headers
self.data = data
self.json = json
self.follow_redirects = follow_redirects
self.verify_ssl = verify_ssl
@staticmethod
def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
return HTTPCrawlerConfig(
method=kwargs.get("method", "GET"),
headers=kwargs.get("headers"),
data=kwargs.get("data"),
json=kwargs.get("json"),
follow_redirects=kwargs.get("follow_redirects", True),
verify_ssl=kwargs.get("verify_ssl", True),
)
def to_dict(self):
return {
"method": self.method,
"headers": self.headers,
"data": self.data,
"json": self.json,
"follow_redirects": self.follow_redirects,
"verify_ssl": self.verify_ssl,
}
def clone(self, **kwargs):
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
HTTPCrawlerConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return HTTPCrawlerConfig.from_kwargs(config_dict)
def dump(self) -> dict:
return to_serializable_dict(self)
@staticmethod
def load(data: dict) -> "HTTPCrawlerConfig":
config = from_serializable_dict(data)
if isinstance(config, HTTPCrawlerConfig):
return config
return HTTPCrawlerConfig.from_kwargs(config)
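
HTTPCrawlerConfig round-trips cleanly through the clone/dump/load methods defined above; a small sketch:

# Sketch using only methods defined on HTTPCrawlerConfig above
base = HTTPCrawlerConfig(method="GET", verify_ssl=True)
post_cfg = base.clone(method="POST", json={"q": "demo"})  # copy with overrides; base is unchanged
payload = post_cfg.dump()                                 # serializable dict via to_serializable_dict
restored = HTTPCrawlerConfig.load(payload)                # back to a config instance
assert restored.method == "POST" and base.method == "GET"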
class CrawlerRunConfig():
"""
Configuration class for controlling how the crawler runs each crawl operation.
@@ -450,7 +485,7 @@ class CrawlerRunConfig():
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
If None, defaults to CacheMode.ENABLED internally.
Default: None.
Default: CacheMode.BYPASS.
session_id (str or None): Optional session ID to persist the browser context and the created
page instance. If the ID already exists, the crawler does not
create a new page and uses the current page to preserve the state.
@@ -543,19 +578,27 @@ class CrawlerRunConfig():
log_console (bool): If True, log console messages from the page.
Default: False.
# Streaming Parameters
# HTTP Crawler Strategy Parameters
method (str): HTTP method to use for the request, when using AsyncHTTPCrawlerStrategy.
Default: "GET".
data (dict): Data to send in the request body, when using AsyncHTTPCrawlerStrategy.
Default: None.
json (dict): JSON data to send in the request body, when using AsyncHTTPCrawlerStrategy.
Default: None.
# Connection Parameters
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
Default: False.
# Optional Parameters
stream (bool): If True, stream the page content as it is being loaded.
url: str = None # This is not a compulsory parameter
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
user_agent (str): Custom User-Agent string to use. Default: None
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
user_agent as-is. Default: None.
Default: False.
user_agent (str): Custom User-Agent string to use.
Default: None.
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is.
Default: None.
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
Default: None.
url: str = None # This is not a compulsory parameter
"""
def __init__(
@@ -580,7 +623,7 @@ class CrawlerRunConfig():
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode: CacheMode =None,
cache_mode: CacheMode = CacheMode.BYPASS,
session_id: str = None,
bypass_cache: bool = False,
disable_cache: bool = False,
@@ -625,7 +668,8 @@ class CrawlerRunConfig():
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
# Streaming Parameters
# Connection Parameters
method: str = "GET",
stream: bool = False,
url: str = None,
check_robots_txt: bool = False,
@@ -713,8 +757,9 @@ class CrawlerRunConfig():
self.verbose = verbose
self.log_console = log_console
# Streaming Parameters
# Connection Parameters
self.stream = stream
self.method = method
# Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt
@@ -769,7 +814,7 @@ class CrawlerRunConfig():
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters
cache_mode=kwargs.get("cache_mode"),
cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
session_id=kwargs.get("session_id"),
bypass_cache=kwargs.get("bypass_cache", False),
disable_cache=kwargs.get("disable_cache", False),
@@ -823,15 +868,17 @@ class CrawlerRunConfig():
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
# Streaming Parameters
# Connection Parameters
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
url=kwargs.get("url"),
check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
# Deep Crawl Parameters
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
url=kwargs.get("url"),
)
# Create a function that returns a dict representation of the object
@@ -843,13 +890,9 @@ class CrawlerRunConfig():
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
# If config is already an instance of CrawlerRunConfig, return it
if isinstance(config, CrawlerRunConfig):
return config
elif isinstance(config, dict):
return CrawlerRunConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type")
return CrawlerRunConfig.from_kwargs(config)
def to_dict(self):
return {
@@ -910,13 +953,14 @@ class CrawlerRunConfig():
"exclude_internal_links": self.exclude_internal_links,
"verbose": self.verbose,
"log_console": self.log_console,
"method": self.method,
"stream": self.stream,
"url": self.url,
"check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
"deep_crawl_strategy": self.deep_crawl_strategy,
"url": self.url,
}
def clone(self, **kwargs):
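
The clone() that begins here presumably mirrors HTTPCrawlerConfig.clone (to_dict merged with overrides, then from_kwargs); a usage sketch under that assumption, exercising the new connection parameters:

from crawl4ai import CacheMode, CrawlerRunConfig

# Sketch: per-run overrides via clone(), assuming the same to_dict + from_kwargs pattern
run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
post_run = run_cfg.clone(method="POST", stream=True)  # new instance; run_cfg keeps method="GET"
assert run_cfg.method == "GET" and post_run.stream is True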