feat: Add URL-specific crawler configurations for multi-URL crawling

Implement dynamic configuration selection based on URL patterns to optimize crawling for different content types. This feature enables users to apply different crawling strategies (PDF extraction, content filtering, JavaScript execution) based on URL matching patterns.

Key additions:
- Add url_matcher and match_mode parameters to CrawlerRunConfig
- Implement is_match() method supporting string patterns, functions, and mixed lists
- Add MatchMode enum for OR/AND logic when combining multiple matchers
- Update AsyncWebCrawler.arun_many() to accept List[CrawlerRunConfig]
- Add select_config() method to dispatchers for runtime config selection
- First matching config wins, with fallback to default (the first config in the list when nothing matches; a fresh CrawlerRunConfig when the list is empty)

Pattern matching supports:
- Glob-style strings: *.pdf, */blog/*, *api*
- Lambda functions: lambda url: 'github.com' in url
- Mixed patterns with AND/OR logic for complex matching

This enables optimal per-URL configuration:
- PDFs: Use PDFContentScrapingStrategy without JavaScript
- Blogs: Apply content filtering to reduce noise
- APIs: Skip JavaScript, use JSON extraction
- Dynamic sites: Execute only necessary JavaScript

Breaking changes: None - fully backward compatible
2025-08-02 19:10:36 +08:00
parent 864d87afb2
commit a03e68fa2f
13 changed files with 1096 additions and 20 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -3,7 +3,7 @@ import warnings

 from .async_webcrawler import AsyncWebCrawler, CacheMode
 # MODIFIED: Add SeedingConfig and VirtualScrollConfig here
-from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
+from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode

 from .content_scraping_strategy import (
    ContentScrapingStrategy,
@@ -132,6 +132,7 @@ __all__ = [
    "CrawlResult",
    "CrawlerHub",
    "CacheMode",
+    "MatchMode",
    "ContentScrapingStrategy",
    "WebScrapingStrategy",
    "LXMLWebScrapingStrategy",
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -24,11 +24,18 @@ from .deep_crawling import DeepCrawlStrategy
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy

-from typing import Union, List
+from typing import Union, List, Callable
 import inspect
 from typing import Any, Dict, Optional
 from enum import Enum

+# Type alias for URL matching
+UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
+
+class MatchMode(Enum):
+    OR = "or"  # a list of matchers matches when any one of them matches
+    AND = "and"  # a list of matchers matches only when all of them match
+
 # from .proxy_strategy import ProxyConfig


@@ -1113,6 +1120,9 @@ class CrawlerRunConfig():
        link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
        # Virtual Scroll Parameters
        virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
+        # URL Matching Parameters
+        url_matcher: Optional[UrlMatcher] = None,
+        match_mode: MatchMode = MatchMode.OR,
        # Experimental Parameters
        experimental: Dict[str, Any] = None,
    ):
@@ -1266,6 +1276,10 @@ class CrawlerRunConfig():
        else:
            raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
        
+        # URL Matching Parameters
+        self.url_matcher = url_matcher
+        self.match_mode = match_mode
+        
        # Experimental Parameters
        self.experimental = experimental or {}
        
@@ -1321,6 +1335,51 @@ class CrawlerRunConfig():
            if "compilation error" not in str(e).lower():
                raise ValueError(f"Failed to compile C4A script: {str(e)}")
            raise
+    
+    def is_match(self, url: str) -> bool:
+        """Check if this config matches the given URL.
+        
+        Args:
+            url: The URL to check against this config's matcher
+            
+        Returns:
+            bool: True if this config should be used for the URL
+        """
+        if self.url_matcher is None:
+            return False  # no matcher configured: this config never matches a URL
+            
+        if callable(self.url_matcher):
+            # Single function matcher: its return value is passed through unchanged (not coerced to bool)
+            return self.url_matcher(url)
+        
+        elif isinstance(self.url_matcher, str):
+            # Single glob-style pattern string, matched against the whole URL with fnmatch
+            from fnmatch import fnmatch
+            return fnmatch(url, self.url_matcher)
+        
+        elif isinstance(self.url_matcher, list):
+            # List of mixed matchers (callables and/or pattern strings), combined via match_mode
+            if not self.url_matcher:  # Empty list never matches
+                return False
+                
+            results = []
+            for matcher in self.url_matcher:
+                if callable(matcher):
+                    results.append(matcher(url))
+                elif isinstance(matcher, str):
+                    from fnmatch import fnmatch
+                    results.append(fnmatch(url, matcher))
+                else:
+                    # Skip invalid matchers (neither callable nor str); they add nothing to results
+                    continue
+            
+            # Apply match mode logic; an empty results list (only invalid matchers) yields False
+            if self.match_mode == MatchMode.OR:
+                return any(results) if results else False
+            else:  # AND mode (any match_mode other than MatchMode.OR lands here)
+                return all(results) if results else False
+        
+        return False  # url_matcher of an unsupported type never matches


    def __getattr__(self, name):
@@ -1443,6 +1502,9 @@ class CrawlerRunConfig():
            # Link Extraction Parameters
            link_preview_config=kwargs.get("link_preview_config"),
            url=kwargs.get("url"),
+            # URL Matching Parameters
+            url_matcher=kwargs.get("url_matcher"),
+            match_mode=kwargs.get("match_mode", MatchMode.OR),
            # Experimental Parameters 
            experimental=kwargs.get("experimental"),
        )
@@ -1540,6 +1602,8 @@ class CrawlerRunConfig():
            "deep_crawl_strategy": self.deep_crawl_strategy,
            "link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
            "url": self.url,
+            "url_matcher": self.url_matcher,
+            "match_mode": self.match_mode,
            "experimental": self.experimental,
        }

--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional, List, Tuple
+from typing import Dict, Optional, List, Tuple, Union
 from .async_configs import CrawlerRunConfig
 from .models import (
    CrawlResult,
@@ -96,11 +96,37 @@ class BaseDispatcher(ABC):
        self.rate_limiter = rate_limiter
        self.monitor = monitor

+    def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> CrawlerRunConfig:
+        """Select the appropriate config for a given URL.
+        
+        Args:
+            url: The URL to match against
+            configs: Single config or list of configs to choose from
+            
+        Returns:
+            The matching config, or the first config if no match, or a default config if empty list
+        """
+        # Single config - return as is (its url_matcher is not consulted)
+        if isinstance(configs, CrawlerRunConfig):
+            return configs
+        
+        # Empty (or otherwise falsy) configs - return a freshly constructed default config
+        if not configs:
+            return CrawlerRunConfig()
+        
+        # Find first matching config; list order decides priority
+        for config in configs:
+            if config.is_match(url):
+                return config
+        
+        # No match found - return first config as fallback, whether or not it has a url_matcher
+        return configs[0]
+
    @abstractmethod
    async def crawl_url(
        self,
        url: str,
-        config: CrawlerRunConfig,
+        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        task_id: str,
        monitor: Optional[CrawlerMonitor] = None,
    ) -> CrawlerTaskResult:
@@ -111,7 +137,7 @@ class BaseDispatcher(ABC):
        self,
        urls: List[str],
        crawler: AsyncWebCrawler,  # noqa: F821
-        config: CrawlerRunConfig,
+        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        monitor: Optional[CrawlerMonitor] = None,
    ) -> List[CrawlerTaskResult]:
        pass
@@ -200,7 +226,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
    async def crawl_url(
        self,
        url: str,
-        config: CrawlerRunConfig,
+        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        task_id: str,
        retry_count: int = 0,
    ) -> CrawlerTaskResult:
@@ -208,6 +234,9 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        error_message = ""
        memory_usage = peak_memory = 0.0
        
+        # Select appropriate config for this URL
+        selected_config = self.select_config(url, config)
+        
        # Get starting memory for accurate measurement
        process = psutil.Process()
        start_memory = process.memory_info().rss / (1024 * 1024)
@@ -257,8 +286,8 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
                    retry_count=retry_count + 1
                )
            
-            # Execute the crawl
-            result = await self.crawler.arun(url, config=config, session_id=task_id)
+            # Execute the crawl with selected config
+            result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
            
            # Measure memory usage
            end_memory = process.memory_info().rss / (1024 * 1024)
@@ -316,7 +345,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        self,
        urls: List[str],
        crawler: AsyncWebCrawler,
-        config: CrawlerRunConfig,
+        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
    ) -> List[CrawlerTaskResult]:
        self.crawler = crawler
        
@@ -470,7 +499,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        self,
        urls: List[str],
        crawler: AsyncWebCrawler,
-        config: CrawlerRunConfig,
+        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
    ) -> AsyncGenerator[CrawlerTaskResult, None]:
        self.crawler = crawler
        
@@ -572,7 +601,7 @@ class SemaphoreDispatcher(BaseDispatcher):
    async def crawl_url(
        self,
        url: str,
-        config: CrawlerRunConfig,
+        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        task_id: str,
        semaphore: asyncio.Semaphore = None,
    ) -> CrawlerTaskResult:
@@ -580,6 +609,9 @@ class SemaphoreDispatcher(BaseDispatcher):
        error_message = ""
        memory_usage = peak_memory = 0.0

+        # Select appropriate config for this URL
+        selected_config = self.select_config(url, config)
+
        try:
            if self.monitor:
                self.monitor.update_task(
@@ -592,7 +624,7 @@ class SemaphoreDispatcher(BaseDispatcher):
            async with semaphore:
                process = psutil.Process()
                start_memory = process.memory_info().rss / (1024 * 1024)
-                result = await self.crawler.arun(url, config=config, session_id=task_id)
+                result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
                end_memory = process.memory_info().rss / (1024 * 1024)

                memory_usage = peak_memory = end_memory - start_memory
@@ -654,7 +686,7 @@ class SemaphoreDispatcher(BaseDispatcher):
        self,
        crawler: AsyncWebCrawler,  # noqa: F821
        urls: List[str],
-        config: CrawlerRunConfig,
+        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
    ) -> List[CrawlerTaskResult]:
        self.crawler = crawler
        if self.monitor:
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -653,7 +653,7 @@ class AsyncWebCrawler:
    async def arun_many(
        self,
        urls: List[str],
-        config: Optional[CrawlerRunConfig] = None,
+        config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
        dispatcher: Optional[BaseDispatcher] = None,
        # Legacy parameters maintained for backwards compatibility
        # word_count_threshold=MIN_WORD_THRESHOLD,
@@ -674,7 +674,9 @@ class AsyncWebCrawler:

        Args:
        urls: List of URLs to crawl
-        config: Configuration object controlling crawl behavior for all URLs
+        config: Configuration object(s) controlling crawl behavior. Can be:
+            - Single CrawlerRunConfig: Used for all URLs
+            - List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings
        dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
        [other parameters maintained for backwards compatibility]

@@ -739,7 +741,11 @@ class AsyncWebCrawler:
                or task_result.result
            )

-        stream = config.stream
+        # Handle stream setting - use first config's stream setting if config is a list
+        if isinstance(config, list):
+            stream = config[0].stream if config else False
+        else:
+            stream = config.stream

        if stream: