fix: Correct URL matcher fallback behavior and improve memory monitoring

Fix critical issue where unmatched URLs incorrectly used the first config instead of failing safely. Also clarify that configs without url_matcher match ALL URLs by design, and improve memory usage monitoring. Bug fixes: - Change select_config() to return None when no config matches instead of using first config - Add proper error handling in dispatchers when no config matches a URL - Return failed CrawlResult with "No matching configuration found" error message - Fix is_match() to return True when url_matcher is None (matches all URLs) - Import and use get_true_memory_usage_percent() for more accurate memory monitoring Behavior clarification: - CrawlerRunConfig with url_matcher=None matches ALL URLs (not nothing) - This is the intended behavior for default/fallback configurations - Enables clean pattern: specific configs first, default config last Documentation updates: - Clarify that configs without url_matcher match everything - Explain "No matching configuration found" error when no default config - Add examples showing proper default config usage - Update all relevant docs: multi-url-crawling.md, arun_many.md, parameters.md - Simplify API config examples by removing extraction_strategy Demo and test updates: - Update demo_multi_config_clean.py with commented default config to show behavior - Change example URL to w3schools.com to demonstrate no-match scenario - Uncomment all test URLs in test_multi_config.py for comprehensive testing Breaking changes: None - this restores the intended behavior This ensures URLs only get processed with appropriate configs, preventing issues like HTML pages being processed with PDF extraction strategies.
2025-08-03 16:50:54 +08:00
parent a03e68fa2f
commit 307fe28b32
9 changed files with 251 additions and 29 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1343,10 +1343,10 @@ class CrawlerRunConfig():
            url: The URL to check against this config's matcher
            
        Returns:
-            bool: True if this config should be used for the URL
+            bool: True if this config should be used for the URL or if no matcher is set.
        """
        if self.url_matcher is None:
-            return False
+            return True
            
        if callable(self.url_matcher):
            # Single function matcher
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -22,6 +22,8 @@ from urllib.parse import urlparse
 import random
 from abc import ABC, abstractmethod

+from .memory_utils import get_true_memory_usage_percent
+

 class RateLimiter:
    def __init__(
@@ -96,7 +98,7 @@ class BaseDispatcher(ABC):
        self.rate_limiter = rate_limiter
        self.monitor = monitor

-    def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> CrawlerRunConfig:
+    def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
        """Select the appropriate config for a given URL.
        
        Args:
@@ -104,23 +106,23 @@ class BaseDispatcher(ABC):
            configs: Single config or list of configs to choose from
            
        Returns:
-            The matching config, or the first config if no match, or a default config if empty list
+            The matching config, or None if no match found
        """
        # Single config - return as is
        if isinstance(configs, CrawlerRunConfig):
            return configs
        
-        # Empty list - return default config
+        # Empty list - return None
        if not configs:
-            return CrawlerRunConfig()
+            return None
        
        # Find first matching config
        for config in configs:
            if config.is_match(url):
                return config
        
-        # No match found - return first config as fallback
-        return configs[0]
+        # No match found - return None to indicate URL should be skipped
+        return None

    @abstractmethod
    async def crawl_url(
@@ -173,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
    async def _memory_monitor_task(self):
        """Background task to continuously monitor memory usage and update state"""
        while True:
-            self.current_memory_percent = psutil.virtual_memory().percent
+            self.current_memory_percent = get_true_memory_usage_percent()

            # Enter memory pressure mode if we cross the threshold
            if self.current_memory_percent >= self.memory_threshold_percent:
@@ -237,6 +239,34 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        # Select appropriate config for this URL
        selected_config = self.select_config(url, config)
        
+        # If no config matches, return failed result
+        if selected_config is None:
+            error_message = f"No matching configuration found for URL: {url}"
+            if self.monitor:
+                self.monitor.update_task(
+                    task_id, 
+                    status=CrawlStatus.FAILED,
+                    error_message=error_message
+                )
+            
+            return CrawlerTaskResult(
+                task_id=task_id,
+                url=url,
+                result=CrawlResult(
+                    url=url, 
+                    html="", 
+                    metadata={"status": "no_config_match"}, 
+                    success=False, 
+                    error_message=error_message
+                ),
+                memory_usage=0,
+                peak_memory=0,
+                start_time=start_time,
+                end_time=time.time(),
+                error_message=error_message,
+                retry_count=retry_count
+            )
+        
        # Get starting memory for accurate measurement
        process = psutil.Process()
        start_memory = process.memory_info().rss / (1024 * 1024)
@@ -611,6 +641,33 @@ class SemaphoreDispatcher(BaseDispatcher):

        # Select appropriate config for this URL
        selected_config = self.select_config(url, config)
+        
+        # If no config matches, return failed result
+        if selected_config is None:
+            error_message = f"No matching configuration found for URL: {url}"
+            if self.monitor:
+                self.monitor.update_task(
+                    task_id, 
+                    status=CrawlStatus.FAILED,
+                    error_message=error_message
+                )
+            
+            return CrawlerTaskResult(
+                task_id=task_id,
+                url=url,
+                result=CrawlResult(
+                    url=url, 
+                    html="", 
+                    metadata={"status": "no_config_match"}, 
+                    success=False, 
+                    error_message=error_message
+                ),
+                memory_usage=0,
+                peak_memory=0,
+                start_time=start_time,
+                end_time=time.time(),
+                error_message=error_message
+            )

        try:
            if self.monitor:
--- a/crawl4ai/memory_utils.py
+++ b/crawl4ai/memory_utils.py
@@ -0,0 +1,79 @@
+import psutil
+import platform
+import subprocess
+from typing import Tuple
+
+
+def get_true_available_memory_gb() -> float:
+    """Get truly available memory including inactive pages (cross-platform)"""
+    vm = psutil.virtual_memory()
+
+    if platform.system() == 'Darwin':  # macOS
+        # On macOS, we need to include inactive memory too
+        try:
+            # Use vm_stat to get accurate values
+            result = subprocess.run(['vm_stat'], capture_output=True, text=True)
+            lines = result.stdout.split('\n')
+
+            page_size = 16384  # assumed page size: 16 KiB on Apple Silicon; Intel Macs use 4 KiB (TODO: read it from the vm_stat header)
+            pages = {}
+
+            for line in lines:
+                if 'Pages free:' in line:
+                    pages['free'] = int(line.split()[-1].rstrip('.'))
+                elif 'Pages inactive:' in line:
+                    pages['inactive'] = int(line.split()[-1].rstrip('.'))
+                elif 'Pages speculative:' in line:
+                    pages['speculative'] = int(line.split()[-1].rstrip('.'))
+                elif 'Pages purgeable:' in line:
+                    pages['purgeable'] = int(line.split()[-1].rstrip('.'))
+
+            # Calculate total available (free + inactive + speculative + purgeable)
+            total_available_pages = (
+                pages.get('free', 0) + 
+                pages.get('inactive', 0) + 
+                pages.get('speculative', 0) + 
+                pages.get('purgeable', 0)
+            )
+            available_gb = (total_available_pages * page_size) / (1024**3)
+
+            return available_gb
+        except:
+            # Fallback to psutil
+            return vm.available / (1024**3)
+    else:
+        # For Windows and Linux, psutil.available is accurate
+        return vm.available / (1024**3)
+
+
+def get_true_memory_usage_percent() -> float:
+    """
+    Get memory usage percentage that accounts for platform differences.
+    
+    Returns:
+        float: Memory usage percentage (0-100)
+    """
+    vm = psutil.virtual_memory()
+    total_gb = vm.total / (1024**3)
+    available_gb = get_true_available_memory_gb()
+    
+    # Calculate used percentage based on truly available memory
+    used_percent = 100.0 * (total_gb - available_gb) / total_gb
+    
+    # Ensure it's within valid range
+    return max(0.0, min(100.0, used_percent))
+
+
+def get_memory_stats() -> Tuple[float, float, float]:
+    """
+    Get comprehensive memory statistics.
+    
+    Returns:
+        Tuple[float, float, float]: (used_percent, available_gb, total_gb)
+    """
+    vm = psutil.virtual_memory()
+    total_gb = vm.total / (1024**3)
+    available_gb = get_true_available_memory_gb()
+    used_percent = get_true_memory_usage_percent()
+    
+    return used_percent, available_gb, total_gb
--- a/docs/examples/demo_multi_config_clean.py
+++ b/docs/examples/demo_multi_config_clean.py
@@ -188,7 +188,6 @@ async def demo_part2_practical_crawling():
                lambda url: 'api' in url or 'httpbin.org' in url  # Function for API endpoints
            ],
            match_mode=MatchMode.OR,
-            extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
        ),
        
        # Config 5: Complex matcher - Secure documentation sites
@@ -200,11 +199,11 @@ async def demo_part2_practical_crawling():
                lambda url: not url.endswith(('.pdf', '.json'))  # Not PDF or JSON
            ],
            match_mode=MatchMode.AND,
-            wait_for="css:.content, css:article"  # Wait for content to load
+            # wait_for="css:.content, css:article"  # Wait for content to load
        ),
        
        # Default config for everything else
-        CrawlerRunConfig()  # No url_matcher means it never matches (except as fallback)
+        # CrawlerRunConfig()  # No url_matcher means it matches everything (use it as fallback)
    ]
    
    # URLs to crawl - each will use a different config
@@ -214,7 +213,7 @@ async def demo_part2_practical_crawling():
        "https://github.com/microsoft/playwright",  # → JS config
        "https://httpbin.org/json",  # → Mixed matcher config (API)
        "https://docs.python.org/3/reference/",  # → Complex matcher config
-        "https://example.com/",  # → Default config
+        "https://www.w3schools.com/",  # → Default config if the default config line above is uncommented; otherwise fails with "No matching configuration found"
    ]
    
    print("URLs to crawl:")
--- a/docs/md_v2/advanced/multi-url-crawling.md
+++ b/docs/md_v2/advanced/multi-url-crawling.md
@@ -447,11 +447,11 @@ async def crawl_mixed_content():
        # API endpoints - JSON extraction
        CrawlerRunConfig(
            url_matcher=lambda url: 'api' in url or url.endswith('.json'),
-            extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
+            # Custom settings for JSON extraction
        ),
        
        # Default config for everything else
-        CrawlerRunConfig()  # No url_matcher = fallback
+        CrawlerRunConfig()  # No url_matcher means it matches ALL URLs (fallback)
    ]
    
    # Mixed URLs
@@ -475,6 +475,8 @@ async def crawl_mixed_content():

 ### 6.2 Advanced Pattern Matching

+**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration.
+
 The `url_matcher` parameter supports three types of patterns:

 #### Glob Patterns (Strings)
@@ -560,11 +562,17 @@ async def crawl_news_site():
 ### 6.4 Best Practices

 1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
-2. **Always Include a Default**: Last config should have no `url_matcher` as a fallback
+2. **Default Config Behavior**: 
+   - A config without `url_matcher` matches ALL URLs
+   - Always include a default config as the last item if you want to handle all URLs
+   - Without a default config, unmatched URLs will fail with "No matching configuration found"
 3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
   ```python
-   config = CrawlerRunConfig(url_matcher="*/api/*")
-   print(config.is_match("https://example.com/api/users"))  # True
+   config = CrawlerRunConfig(url_matcher="*.pdf")
+   print(config.is_match("https://example.com/doc.pdf"))  # True
+   
+   default_config = CrawlerRunConfig()  # No url_matcher
+   print(default_config.is_match("https://any-url.com"))  # True - matches everything!
   ```
 4. **Optimize for Performance**: 
   - Disable JS for static content
--- a/docs/md_v2/api/arun_many.md
+++ b/docs/md_v2/api/arun_many.md
@@ -131,7 +131,7 @@ github_config = CrawlerRunConfig(
 # API endpoints - JSON extraction
 api_config = CrawlerRunConfig(
    url_matcher=lambda url: 'api' in url or url.endswith('.json'),
-    extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
+    # Custom settings for JSON extraction
 )

 # Default fallback config
@@ -160,6 +160,7 @@ results = await crawler.arun_many(
 - Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
 - `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.  
 - If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
+- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.

 ### Return Value

--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -214,7 +214,7 @@ See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detaile

 | **Parameter**          | **Type / Default**           | **What It Does**                                                                                                                    |
 |------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
-| **`url_matcher`**      | `UrlMatcher` (None)          | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types                                         |
+| **`url_matcher`**      | `UrlMatcher` (None)          | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. **None means match ALL URLs**         |
 | **`match_mode`**       | `MatchMode` (MatchMode.OR)   | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match)                       |

 The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
@@ -239,7 +239,7 @@ blog_config = CrawlerRunConfig(
 # Function matcher
 api_config = CrawlerRunConfig(
    url_matcher=lambda url: 'api' in url or url.endswith('.json'),
-    extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
+    # Other settings like extraction_strategy
 )

 # Mixed: String + Function with AND logic
@@ -257,14 +257,21 @@ secure_docs = CrawlerRunConfig(
    url_matcher=["https://*", lambda url: '.doc' in url],
    match_mode=MatchMode.AND  # Must be HTTPS AND contain .doc
 )
+
+# Default config - matches ALL URLs
+default_config = CrawlerRunConfig()  # No url_matcher = matches everything
 ```

 **UrlMatcher Types:**
+- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs
 - **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
 - **Functions**: `lambda url: bool` - Custom logic for complex matching
 - **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`

-When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
+**Important Behavior:**
+- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
+- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
+- Always include a default config as the last item if you want to handle all URLs

 ---## 2.2 Helper Methods

--- a/tests/test_memory_macos.py
+++ b/tests/test_memory_macos.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""Test script to verify macOS memory calculation accuracy."""
+
+import psutil
+import platform
+import time
+from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
+
+
+def test_memory_calculation():
+    """Test and compare memory calculations."""
+    print(f"Platform: {platform.system()}")
+    print(f"Python version: {platform.python_version()}")
+    print("-" * 60)
+    
+    # Get psutil's view
+    vm = psutil.virtual_memory()
+    psutil_percent = vm.percent
+    psutil_available_gb = vm.available / (1024**3)
+    total_gb = vm.total / (1024**3)
+    
+    # Get our corrected view
+    true_percent = get_true_memory_usage_percent()
+    true_available_gb = get_true_available_memory_gb()
+    true_percent_calc, available_calc, total_calc = get_memory_stats()
+    
+    print("Memory Statistics Comparison:")
+    print(f"Total Memory: {total_gb:.2f} GB")
+    print()
+    
+    print("PSUtil (Standard) Calculation:")
+    print(f"  - Memory Used: {psutil_percent:.1f}%")
+    print(f"  - Available: {psutil_available_gb:.2f} GB")
+    print()
+    
+    print("Platform-Aware Calculation:")
+    print(f"  - Memory Used: {true_percent:.1f}%")
+    print(f"  - Available: {true_available_gb:.2f} GB")
+    print(f"  - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory")
+    print()
+    
+    # Show the impact on dispatcher behavior
+    print("Impact on MemoryAdaptiveDispatcher:")
+    thresholds = {
+        "Normal": 90.0,
+        "Critical": 95.0,
+        "Recovery": 85.0
+    }
+    
+    for name, threshold in thresholds.items():
+        psutil_triggered = psutil_percent >= threshold
+        true_triggered = true_percent >= threshold
+        print(f"  - {name} Threshold ({threshold}%):")
+        print(f"    PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
+        print(f"    Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
+        if psutil_triggered != true_triggered:
+            print(f"    → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
+    print()
+    
+    # Monitor for a few seconds
+    print("Monitoring memory for 10 seconds...")
+    for i in range(10):
+        vm = psutil.virtual_memory()
+        true_pct = get_true_memory_usage_percent()
+        print(f"  {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
+        time.sleep(1)
+    print("\n")
+
+
+if __name__ == "__main__":
+    test_memory_calculation()
--- a/tests/test_multi_config.py
+++ b/tests/test_multi_config.py
@@ -55,13 +55,13 @@ async def test_multi_config():
    
    # Test URLs - using real URLs that exist
    test_urls = [
-        # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
-        # "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
-        # "https://blog.python.org/",  # Blog URL  
-        # "https://api.github.com/users/github",  # GitHub API (returns JSON)
-        # "https://httpbin.org/json",  # API endpoint that returns JSON
-        # "https://www.python.org/",  # Generic HTTPS page
-        # "http://info.cern.ch/",  # HTTP (not HTTPS) page
+        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
+        "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
+        "https://blog.python.org/",  # Blog URL  
+        "https://api.github.com/users/github",  # GitHub API (returns JSON)
+        "https://httpbin.org/json",  # API endpoint that returns JSON
+        "https://www.python.org/",  # Generic HTTPS page
+        "http://info.cern.ch/",  # HTTP (not HTTPS) page
        "https://example.com/",  # → Default config
    ]