Fix critical issue where unmatched URLs incorrectly used the first config instead of failing safely. Also clarify that configs without url_matcher match ALL URLs by design, and improve memory usage monitoring.

Bug fixes:
- Change select_config() to return None when no config matches instead of using first config
- Add proper error handling in dispatchers when no config matches a URL
- Return failed CrawlResult with "No matching configuration found" error message
- Fix is_match() to return True when url_matcher is None (matches all URLs)
- Import and use get_true_memory_usage_percent() for more accurate memory monitoring

Behavior clarification:
- CrawlerRunConfig with url_matcher=None matches ALL URLs (not nothing)
- This is the intended behavior for default/fallback configurations
- Enables clean pattern: specific configs first, default config last

Documentation updates:
- Clarify that configs without url_matcher match everything
- Explain "No matching configuration found" error when no default config
- Add examples showing proper default config usage
- Update all relevant docs: multi-url-crawling.md, arun_many.md, parameters.md
- Simplify API config examples by removing extraction_strategy

Demo and test updates:
- Update demo_multi_config_clean.py with commented default config to show behavior
- Change example URL to w3schools.com to demonstrate no-match scenario
- Uncomment all test URLs in test_multi_config.py for comprehensive testing

Breaking changes: None - this restores the intended behavior

This ensures URLs only get processed with appropriate configs, preventing issues like HTML pages being processed with PDF extraction strategies.
71 lines
2.5 KiB
Python
Executable File
71 lines
2.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Test script to verify macOS memory calculation accuracy."""
|
|
|
|
import psutil
|
|
import platform
|
|
import time
|
|
from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
|
|
|
|
|
|
def test_memory_calculation():
    """Print psutil's memory figures side by side with the platform-aware ones.

    The report has four parts: platform information, a comparison of used
    percentage and available memory from both sources, a check of each used
    percentage against three fixed thresholds, and a ten-sample live readout
    taken one second apart. Nothing is returned; all output goes to stdout.
    """
    bytes_per_gb = 1024**3

    print(f"Platform: {platform.system()}")
    print(f"Python version: {platform.python_version()}")
    print("-" * 60)

    # Snapshot of what psutil reports.
    snapshot = psutil.virtual_memory()
    psutil_percent = snapshot.percent
    psutil_available_gb = snapshot.available / bytes_per_gb
    total_gb = snapshot.total / bytes_per_gb

    # The same quantities according to crawl4ai.memory_utils.
    true_percent = get_true_memory_usage_percent()
    true_available_gb = get_true_available_memory_gb()
    # The three-value result is fetched but not displayed below.
    _stats_percent, _stats_available, _stats_total = get_memory_stats()

    print("Memory Statistics Comparison:")
    print(f"Total Memory: {total_gb:.2f} GB")
    print()

    print("PSUtil (Standard) Calculation:")
    print(f" - Memory Used: {psutil_percent:.1f}%")
    print(f" - Available: {psutil_available_gb:.2f} GB")
    print()

    reclaimable_gb = true_available_gb - psutil_available_gb
    print("Platform-Aware Calculation:")
    print(f" - Memory Used: {true_percent:.1f}%")
    print(f" - Available: {true_available_gb:.2f} GB")
    print(f" - Difference: {reclaimable_gb:.2f} GB of reclaimable memory")
    print()

    # Compare both percentages against each threshold, in this order.
    print("Impact on MemoryAdaptiveDispatcher:")
    threshold_table = (
        ("Normal", 90.0),
        ("Critical", 95.0),
        ("Recovery", 85.0),
    )
    for name, threshold in threshold_table:
        psutil_triggered = psutil_percent >= threshold
        true_triggered = true_percent >= threshold

        psutil_state = "TRIGGERED" if psutil_triggered else "OK"
        true_state = "TRIGGERED" if true_triggered else "OK"

        print(f" - {name} Threshold ({threshold}%):")
        print(f" PSUtil: {psutil_state}")
        print(f" Platform-Aware: {true_state}")

        if psutil_triggered != true_triggered:
            # Only one of the two sources crossed the threshold.
            avoided = "pressure" if psutil_triggered else "recovery"
            print(f" → Difference: Platform-aware prevents false {avoided}")
    print()

    # Live readout: one sample per second, overwriting the same console line.
    print("Monitoring memory for 10 seconds...")
    for second in range(1, 11):
        snapshot = psutil.virtual_memory()
        live_percent = get_true_memory_usage_percent()
        print(
            f" {second}s - PSUtil: {snapshot.percent:.1f}% | Platform-Aware: {live_percent:.1f}%",
            end="\r",
        )
        time.sleep(1)
    print("\n")
|
|
|
|
|
|
# Script entry point: run the comparison only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_memory_calculation()