diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 7cab8f57..62f62eea 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1343,10 +1343,10 @@ class CrawlerRunConfig(): url: The URL to check against this config's matcher Returns: - bool: True if this config should be used for the URL + bool: True if this config should be used for the URL or if no matcher is set. """ if self.url_matcher is None: - return False + return True if callable(self.url_matcher): # Single function matcher diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index 77739616..0f3fab3d 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -22,6 +22,8 @@ from urllib.parse import urlparse import random from abc import ABC, abstractmethod +from .memory_utils import get_true_memory_usage_percent + class RateLimiter: def __init__( @@ -96,7 +98,7 @@ class BaseDispatcher(ABC): self.rate_limiter = rate_limiter self.monitor = monitor - def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> CrawlerRunConfig: + def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]: """Select the appropriate config for a given URL. 
Args: @@ -104,23 +106,23 @@ class BaseDispatcher(ABC): configs: Single config or list of configs to choose from Returns: - The matching config, or the first config if no match, or a default config if empty list + The matching config, or None if no match found """ # Single config - return as is if isinstance(configs, CrawlerRunConfig): return configs - # Empty list - return default config + # Empty list - return None if not configs: - return CrawlerRunConfig() + return None # Find first matching config for config in configs: if config.is_match(url): return config - # No match found - return first config as fallback - return configs[0] + # No match found - return None to indicate URL should be skipped + return None @abstractmethod async def crawl_url( @@ -173,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): async def _memory_monitor_task(self): """Background task to continuously monitor memory usage and update state""" while True: - self.current_memory_percent = psutil.virtual_memory().percent + self.current_memory_percent = get_true_memory_usage_percent() # Enter memory pressure mode if we cross the threshold if self.current_memory_percent >= self.memory_threshold_percent: @@ -237,6 +239,34 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): # Select appropriate config for this URL selected_config = self.select_config(url, config) + # If no config matches, return failed result + if selected_config is None: + error_message = f"No matching configuration found for URL: {url}" + if self.monitor: + self.monitor.update_task( + task_id, + status=CrawlStatus.FAILED, + error_message=error_message + ) + + return CrawlerTaskResult( + task_id=task_id, + url=url, + result=CrawlResult( + url=url, + html="", + metadata={"status": "no_config_match"}, + success=False, + error_message=error_message + ), + memory_usage=0, + peak_memory=0, + start_time=start_time, + end_time=time.time(), + error_message=error_message, + retry_count=retry_count + ) + # Get starting memory for 
accurate measurement process = psutil.Process() start_memory = process.memory_info().rss / (1024 * 1024) @@ -611,6 +641,33 @@ class SemaphoreDispatcher(BaseDispatcher): # Select appropriate config for this URL selected_config = self.select_config(url, config) + + # If no config matches, return failed result + if selected_config is None: + error_message = f"No matching configuration found for URL: {url}" + if self.monitor: + self.monitor.update_task( + task_id, + status=CrawlStatus.FAILED, + error_message=error_message + ) + + return CrawlerTaskResult( + task_id=task_id, + url=url, + result=CrawlResult( + url=url, + html="", + metadata={"status": "no_config_match"}, + success=False, + error_message=error_message + ), + memory_usage=0, + peak_memory=0, + start_time=start_time, + end_time=time.time(), + error_message=error_message + ) try: if self.monitor: diff --git a/crawl4ai/memory_utils.py b/crawl4ai/memory_utils.py new file mode 100644 index 00000000..fa140c93 --- /dev/null +++ b/crawl4ai/memory_utils.py @@ -0,0 +1,79 @@ +import psutil +import platform +import subprocess +from typing import Tuple + + +def get_true_available_memory_gb() -> float: + """Get truly available memory including inactive pages (cross-platform)""" + vm = psutil.virtual_memory() + + if platform.system() == 'Darwin': # macOS + # On macOS, we need to include inactive memory too + try: + # Use vm_stat to get accurate values + result = subprocess.run(['vm_stat'], capture_output=True, text=True) + lines = result.stdout.split('\n') + + page_size = 16384 # macOS page size + pages = {} + + for line in lines: + if 'Pages free:' in line: + pages['free'] = int(line.split()[-1].rstrip('.')) + elif 'Pages inactive:' in line: + pages['inactive'] = int(line.split()[-1].rstrip('.')) + elif 'Pages speculative:' in line: + pages['speculative'] = int(line.split()[-1].rstrip('.')) + elif 'Pages purgeable:' in line: + pages['purgeable'] = int(line.split()[-1].rstrip('.')) + + # Calculate total available (free + 
inactive + speculative + purgeable) + total_available_pages = ( + pages.get('free', 0) + + pages.get('inactive', 0) + + pages.get('speculative', 0) + + pages.get('purgeable', 0) + ) + available_gb = (total_available_pages * page_size) / (1024**3) + + return available_gb + except: + # Fallback to psutil + return vm.available / (1024**3) + else: + # For Windows and Linux, psutil.available is accurate + return vm.available / (1024**3) + + +def get_true_memory_usage_percent() -> float: + """ + Get memory usage percentage that accounts for platform differences. + + Returns: + float: Memory usage percentage (0-100) + """ + vm = psutil.virtual_memory() + total_gb = vm.total / (1024**3) + available_gb = get_true_available_memory_gb() + + # Calculate used percentage based on truly available memory + used_percent = 100.0 * (total_gb - available_gb) / total_gb + + # Ensure it's within valid range + return max(0.0, min(100.0, used_percent)) + + +def get_memory_stats() -> Tuple[float, float, float]: + """ + Get comprehensive memory statistics. 
+ + Returns: + Tuple[float, float, float]: (used_percent, available_gb, total_gb) + """ + vm = psutil.virtual_memory() + total_gb = vm.total / (1024**3) + available_gb = get_true_available_memory_gb() + used_percent = get_true_memory_usage_percent() + + return used_percent, available_gb, total_gb \ No newline at end of file diff --git a/docs/examples/demo_multi_config_clean.py b/docs/examples/demo_multi_config_clean.py index fb3a72ed..09df71ce 100644 --- a/docs/examples/demo_multi_config_clean.py +++ b/docs/examples/demo_multi_config_clean.py @@ -188,7 +188,6 @@ async def demo_part2_practical_crawling(): lambda url: 'api' in url or 'httpbin.org' in url # Function for API endpoints ], match_mode=MatchMode.OR, - extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) ), # Config 5: Complex matcher - Secure documentation sites @@ -200,11 +199,11 @@ async def demo_part2_practical_crawling(): lambda url: not url.endswith(('.pdf', '.json')) # Not PDF or JSON ], match_mode=MatchMode.AND, - wait_for="css:.content, css:article" # Wait for content to load + # wait_for="css:.content, css:article" # Wait for content to load ), # Default config for everything else - CrawlerRunConfig() # No url_matcher means it never matches (except as fallback) + # CrawlerRunConfig() # No url_matcher means it matches everything (use it as fallback) ] # URLs to crawl - each will use a different config @@ -214,7 +213,7 @@ async def demo_part2_practical_crawling(): "https://github.com/microsoft/playwright", # → JS config "https://httpbin.org/json", # → Mixed matcher config (API) "https://docs.python.org/3/reference/", # → Complex matcher config - "https://example.com/", # → Default config + "https://www.w3schools.com/", # → Default config, if you uncomment the default config line above, if not you will see `Error: No matching configuration` ] print("URLs to crawl:") diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index aa4241fe..2c924eff 
100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -447,11 +447,11 @@ async def crawl_mixed_content(): # API endpoints - JSON extraction CrawlerRunConfig( url_matcher=lambda url: 'api' in url or url.endswith('.json'), - extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) + # Custom settings for JSON extraction ), # Default config for everything else - CrawlerRunConfig() # No url_matcher = fallback + CrawlerRunConfig() # No url_matcher means it matches ALL URLs (fallback) ] # Mixed URLs @@ -475,6 +475,8 @@ async def crawl_mixed_content(): ### 6.2 Advanced Pattern Matching +**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration. + The `url_matcher` parameter supports three types of patterns: #### Glob Patterns (Strings) @@ -560,11 +562,17 @@ async def crawl_news_site(): ### 6.4 Best Practices 1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones -2. **Always Include a Default**: Last config should have no `url_matcher` as a fallback +2. **Default Config Behavior**: + - A config without `url_matcher` matches ALL URLs + - Always include a default config as the last item if you want to handle all URLs + - Without a default config, unmatched URLs will fail with "No matching configuration found" 3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns: ```python - config = CrawlerRunConfig(url_matcher="*/api/*") - print(config.is_match("https://example.com/api/users")) # True + config = CrawlerRunConfig(url_matcher="*.pdf") + print(config.is_match("https://example.com/doc.pdf")) # True + + default_config = CrawlerRunConfig() # No url_matcher + print(default_config.is_match("https://any-url.com")) # True - matches everything! ``` 4. 
**Optimize for Performance**: - Disable JS for static content diff --git a/docs/md_v2/api/arun_many.md b/docs/md_v2/api/arun_many.md index a233a0f9..146584c3 100644 --- a/docs/md_v2/api/arun_many.md +++ b/docs/md_v2/api/arun_many.md @@ -131,7 +131,7 @@ github_config = CrawlerRunConfig( # API endpoints - JSON extraction api_config = CrawlerRunConfig( url_matcher=lambda url: 'api' in url or url.endswith('.json'), - extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) + # Custome settings for JSON extraction ) # Default fallback config @@ -160,6 +160,7 @@ results = await crawler.arun_many( - Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy. - `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.  - If you need to handle authentication or session IDs, pass them in each individual task or within your run config. +- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail. ### Return Value diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index b6b4e402..ba526fb7 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -214,7 +214,7 @@ See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detaile | **Parameter** | **Type / Default** | **What It Does** | |------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types | +| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. 
**None means match ALL URLs** | | **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) | The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`: @@ -239,7 +239,7 @@ blog_config = CrawlerRunConfig( # Function matcher api_config = CrawlerRunConfig( url_matcher=lambda url: 'api' in url or url.endswith('.json'), - extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) + # Other settings like extraction_strategy ) # Mixed: String + Function with AND logic @@ -257,14 +257,21 @@ secure_docs = CrawlerRunConfig( url_matcher=["https://*", lambda url: '.doc' in url], match_mode=MatchMode.AND # Must be HTTPS AND contain .doc ) + +# Default config - matches ALL URLs +default_config = CrawlerRunConfig() # No url_matcher = matches everything ``` **UrlMatcher Types:** +- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs - **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"` - **Functions**: `lambda url: bool` - Custom logic for complex matching - **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND` -When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins! +**Important Behavior:** +- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins! 
+- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found" +- Always include a default config as the last item if you want to handle all URLs ---## 2.2 Helper Methods diff --git a/tests/test_memory_macos.py b/tests/test_memory_macos.py new file mode 100755 index 00000000..b94d8a8b --- /dev/null +++ b/tests/test_memory_macos.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +"""Test script to verify macOS memory calculation accuracy.""" + +import psutil +import platform +import time +from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb + + +def test_memory_calculation(): + """Test and compare memory calculations.""" + print(f"Platform: {platform.system()}") + print(f"Python version: {platform.python_version()}") + print("-" * 60) + + # Get psutil's view + vm = psutil.virtual_memory() + psutil_percent = vm.percent + psutil_available_gb = vm.available / (1024**3) + total_gb = vm.total / (1024**3) + + # Get our corrected view + true_percent = get_true_memory_usage_percent() + true_available_gb = get_true_available_memory_gb() + true_percent_calc, available_calc, total_calc = get_memory_stats() + + print("Memory Statistics Comparison:") + print(f"Total Memory: {total_gb:.2f} GB") + print() + + print("PSUtil (Standard) Calculation:") + print(f" - Memory Used: {psutil_percent:.1f}%") + print(f" - Available: {psutil_available_gb:.2f} GB") + print() + + print("Platform-Aware Calculation:") + print(f" - Memory Used: {true_percent:.1f}%") + print(f" - Available: {true_available_gb:.2f} GB") + print(f" - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory") + print() + + # Show the impact on dispatcher behavior + print("Impact on MemoryAdaptiveDispatcher:") + thresholds = { + "Normal": 90.0, + "Critical": 95.0, + "Recovery": 85.0 + } + + for name, threshold in thresholds.items(): + psutil_triggered = 
psutil_percent >= threshold + true_triggered = true_percent >= threshold + print(f" - {name} Threshold ({threshold}%):") + print(f" PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}") + print(f" Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}") + if psutil_triggered != true_triggered: + print(f" → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}") + print() + + # Monitor for a few seconds + print("Monitoring memory for 10 seconds...") + for i in range(10): + vm = psutil.virtual_memory() + true_pct = get_true_memory_usage_percent() + print(f" {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r") + time.sleep(1) + print("\n") + + +if __name__ == "__main__": + test_memory_calculation() \ No newline at end of file diff --git a/tests/test_multi_config.py b/tests/test_multi_config.py index ff8a6b17..09dd5283 100644 --- a/tests/test_multi_config.py +++ b/tests/test_multi_config.py @@ -55,13 +55,13 @@ async def test_multi_config(): # Test URLs - using real URLs that exist test_urls = [ - # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF - # "https://www.bbc.com/news/articles/c5y3e3glnldo", # News article - # "https://blog.python.org/", # Blog URL - # "https://api.github.com/users/github", # GitHub API (returns JSON) - # "https://httpbin.org/json", # API endpoint that returns JSON - # "https://www.python.org/", # Generic HTTPS page - # "http://info.cern.ch/", # HTTP (not HTTPS) page + "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF + "https://www.bbc.com/news/articles/c5y3e3glnldo", # News article + "https://blog.python.org/", # Blog URL + "https://api.github.com/users/github", # GitHub API (returns JSON) + "https://httpbin.org/json", # API endpoint that returns JSON + "https://www.python.org/", # Generic HTTPS page + "http://info.cern.ch/", # HTTP (not HTTPS) page "https://example.com/", # → Default config ]