fix: Correct URL matcher fallback behavior and improve memory monitoring

Fix critical issue where unmatched URLs incorrectly used the first config instead of failing safely. Also clarify that configs without url_matcher match ALL URLs by design, and improve memory usage monitoring. Bug fixes: - Change select_config() to return None when no config matches instead of using first config - Add proper error handling in dispatchers when no config matches a URL - Return failed CrawlResult with "No matching configuration found" error message - Fix is_match() to return True when url_matcher is None (matches all URLs) - Import and use get_true_memory_usage_percent() for more accurate memory monitoring Behavior clarification: - CrawlerRunConfig with url_matcher=None matches ALL URLs (not nothing) - This is the intended behavior for default/fallback configurations - Enables clean pattern: specific configs first, default config last Documentation updates: - Clarify that configs without url_matcher match everything - Explain "No matching configuration found" error when no default config - Add examples showing proper default config usage - Update all relevant docs: multi-url-crawling.md, arun_many.md, parameters.md - Simplify API config examples by removing extraction_strategy Demo and test updates: - Update demo_multi_config_clean.py with commented default config to show behavior - Change example URL to w3schools.com to demonstrate no-match scenario - Uncomment all test URLs in test_multi_config.py for comprehensive testing Breaking changes: None - this restores the intended behavior This ensures URLs only get processed with appropriate configs, preventing issues like HTML pages being processed with PDF extraction strategies.
2025-08-03 16:50:54 +08:00
parent a03e68fa2f
commit 307fe28b32
9 changed files with 251 additions and 29 deletions
--- a/tests/test_memory_macos.py
+++ b/tests/test_memory_macos.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""Test script to verify macOS memory calculation accuracy."""
+
+import psutil
+import platform
+import time
+from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
+
+
+def test_memory_calculation():
+    """Print psutil's memory figures beside the platform-aware ones; asserts nothing."""
+    bytes_per_gib = 1024**3
+    print(f"Platform: {platform.system()}")
+    print(f"Python version: {platform.python_version()}")
+    print("-" * 60)
+
+    # Baseline: what psutil reports on its own.
+    snapshot = psutil.virtual_memory()
+    std_percent = snapshot.percent
+    std_available_gb = snapshot.available / bytes_per_gib
+    total_gb = snapshot.total / bytes_per_gib
+
+    # Same quantities from the crawl4ai.memory_utils helpers.
+    true_percent = get_true_memory_usage_percent()
+    true_available_gb = get_true_available_memory_gb()
+    # Called only to check it runs and yields three values; they are not shown.
+    _stats_percent, _stats_available, _stats_total = get_memory_stats()
+
+    # Report the two views side by side.
+    print("Memory Statistics Comparison:")
+    print(f"Total Memory: {total_gb:.2f} GB")
+    print()
+    print("PSUtil (Standard) Calculation:")
+    print(f"  - Memory Used: {std_percent:.1f}%")
+    print(f"  - Available: {std_available_gb:.2f} GB")
+    print()
+    print("Platform-Aware Calculation:")
+    print(f"  - Memory Used: {true_percent:.1f}%")
+    print(f"  - Available: {true_available_gb:.2f} GB")
+    extra_gb = true_available_gb - std_available_gb
+    print(f"  - Difference: {extra_gb:.2f} GB of reclaimable memory")
+    print()
+
+    # Evaluate both readings against each threshold percentage below.
+    print("Impact on MemoryAdaptiveDispatcher:")
+    levels = (("Normal", 90.0), ("Critical", 95.0), ("Recovery", 85.0))
+    for label, limit in levels:
+        std_hit = std_percent >= limit
+        true_hit = true_percent >= limit
+        print(f"  - {label} Threshold ({limit}%):")
+        print(f"    PSUtil: {'TRIGGERED' if std_hit else 'OK'}")
+        print(f"    Platform-Aware: {'TRIGGERED' if true_hit else 'OK'}")
+        # Only call out thresholds where the two readings disagree.
+        if std_hit != true_hit:
+            avoided = "pressure" if std_hit else "recovery"
+            print(f"    → Difference: Platform-aware prevents false {avoided}")
+    print()
+
+    # Sample both readings once per second, overwriting the same console line.
+    print("Monitoring memory for 10 seconds...")
+    for second in range(1, 11):
+        std_now = psutil.virtual_memory().percent
+        true_now = get_true_memory_usage_percent()
+        line = f"  {second}s - PSUtil: {std_now:.1f}% | Platform-Aware: {true_now:.1f}%"
+        print(line, end="\r")
+        time.sleep(1)
+    print("\n")
+
+
+if __name__ == "__main__":  # run the comparison when executed directly as a script
+    test_memory_calculation()