fix: Correct URL matcher fallback behavior and improve memory monitoring

Fix a critical issue where unmatched URLs incorrectly used the first config instead of failing safely. Also clarify that configs without url_matcher match ALL URLs by design, and improve memory usage monitoring.

Bug fixes:
- Change select_config() to return None when no config matches instead of using the first config
- Add proper error handling in dispatchers when no config matches a URL
- Return a failed CrawlResult with a "No matching configuration found" error message
- Fix is_match() to return True when url_matcher is None (matches all URLs)
- Import and use get_true_memory_usage_percent() for more accurate memory monitoring

Behavior clarification:
- CrawlerRunConfig with url_matcher=None matches ALL URLs (not nothing)
- This is the intended behavior for default/fallback configurations
- Enables a clean pattern: specific configs first, default config last

Documentation updates:
- Clarify that configs without url_matcher match everything
- Explain the "No matching configuration found" error when there is no default config
- Add examples showing proper default config usage
- Update all relevant docs: multi-url-crawling.md, arun_many.md, parameters.md
- Simplify API config examples by removing extraction_strategy

Demo and test updates:
- Update demo_multi_config_clean.py with a commented-out default config to show the behavior
- Change the example URL to w3schools.com to demonstrate the no-match scenario
- Uncomment all test URLs in test_multi_config.py for comprehensive testing

Breaking changes: None - this restores the intended behavior.

This ensures URLs only get processed with appropriate configs, preventing issues like HTML pages being processed with PDF extraction strategies.
2025-08-03 16:50:54 +08:00
parent a03e68fa2f
commit 307fe28b32
9 changed files with 251 additions and 29 deletions
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -22,6 +22,8 @@ from urllib.parse import urlparse
 import random
 from abc import ABC, abstractmethod

+from .memory_utils import get_true_memory_usage_percent
+

 class RateLimiter:
    def __init__(
@@ -96,7 +98,7 @@ class BaseDispatcher(ABC):
        self.rate_limiter = rate_limiter
        self.monitor = monitor

-    def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> CrawlerRunConfig:
+    def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
        """Select the appropriate config for a given URL.
        
        Args:
@@ -104,23 +106,23 @@ class BaseDispatcher(ABC):
            configs: Single config or list of configs to choose from
            
        Returns:
-            The matching config, or the first config if no match, or a default config if empty list
+            The matching config, or None if no match found
        """
        # Single config - return as is
        if isinstance(configs, CrawlerRunConfig):
            return configs
        
-        # Empty list - return default config
+        # Empty list - return None
        if not configs:
-            return CrawlerRunConfig()
+            return None
        
        # Find first matching config
        for config in configs:
            if config.is_match(url):
                return config
        
-        # No match found - return first config as fallback
-        return configs[0]
+        # No match found - return None to indicate URL should be skipped
+        return None

    @abstractmethod
    async def crawl_url(
@@ -173,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
    async def _memory_monitor_task(self):
        """Background task to continuously monitor memory usage and update state"""
        while True:
-            self.current_memory_percent = psutil.virtual_memory().percent
+            self.current_memory_percent = get_true_memory_usage_percent()

            # Enter memory pressure mode if we cross the threshold
            if self.current_memory_percent >= self.memory_threshold_percent:
@@ -237,6 +239,34 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        # Select appropriate config for this URL
        selected_config = self.select_config(url, config)
        
+        # If no config matches, return failed result
+        if selected_config is None:
+            error_message = f"No matching configuration found for URL: {url}"
+            if self.monitor:
+                self.monitor.update_task(
+                    task_id, 
+                    status=CrawlStatus.FAILED,
+                    error_message=error_message
+                )
+            
+            return CrawlerTaskResult(
+                task_id=task_id,
+                url=url,
+                result=CrawlResult(
+                    url=url, 
+                    html="", 
+                    metadata={"status": "no_config_match"}, 
+                    success=False, 
+                    error_message=error_message
+                ),
+                memory_usage=0,
+                peak_memory=0,
+                start_time=start_time,
+                end_time=time.time(),
+                error_message=error_message,
+                retry_count=retry_count
+            )
+        
        # Get starting memory for accurate measurement
        process = psutil.Process()
        start_memory = process.memory_info().rss / (1024 * 1024)
@@ -611,6 +641,33 @@ class SemaphoreDispatcher(BaseDispatcher):

        # Select appropriate config for this URL
        selected_config = self.select_config(url, config)
+        
+        # If no config matches, return failed result
+        if selected_config is None:
+            error_message = f"No matching configuration found for URL: {url}"
+            if self.monitor:
+                self.monitor.update_task(
+                    task_id, 
+                    status=CrawlStatus.FAILED,
+                    error_message=error_message
+                )
+            
+            return CrawlerTaskResult(
+                task_id=task_id,
+                url=url,
+                result=CrawlResult(
+                    url=url, 
+                    html="", 
+                    metadata={"status": "no_config_match"}, 
+                    success=False, 
+                    error_message=error_message
+                ),
+                memory_usage=0,
+                peak_memory=0,
+                start_time=start_time,
+                end_time=time.time(),
+                error_message=error_message
+            )

        try:
            if self.monitor: