fix: Correct URL matcher fallback behavior and improve memory monitoring
Fix critical issue where unmatched URLs incorrectly used the first config instead of failing safely. Also clarify that configs without url_matcher match ALL URLs by design, and improve memory usage monitoring. Bug fixes: - Change select_config() to return None when no config matches instead of using first config - Add proper error handling in dispatchers when no config matches a URL - Return failed CrawlResult with "No matching configuration found" error message - Fix is_match() to return True when url_matcher is None (matches all URLs) - Import and use get_true_memory_usage_percent() for more accurate memory monitoring Behavior clarification: - CrawlerRunConfig with url_matcher=None matches ALL URLs (not nothing) - This is the intended behavior for default/fallback configurations - Enables clean pattern: specific configs first, default config last Documentation updates: - Clarify that configs without url_matcher match everything - Explain "No matching configuration found" error when no default config - Add examples showing proper default config usage - Update all relevant docs: multi-url-crawling.md, arun_many.md, parameters.md - Simplify API config examples by removing extraction_strategy Demo and test updates: - Update demo_multi_config_clean.py with commented default config to show behavior - Change example URL to w3schools.com to demonstrate no-match scenario - Uncomment all test URLs in test_multi_config.py for comprehensive testing Breaking changes: None - this restores the intended behavior This ensures URLs only get processed with appropriate configs, preventing issues like HTML pages being processed with PDF extraction strategies.
This commit is contained in:
@@ -1343,10 +1343,10 @@ class CrawlerRunConfig():
|
|||||||
url: The URL to check against this config's matcher
|
url: The URL to check against this config's matcher
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bool: True if this config should be used for the URL
|
bool: True if this config should be used for the URL or if no matcher is set.
|
||||||
"""
|
"""
|
||||||
if self.url_matcher is None:
|
if self.url_matcher is None:
|
||||||
return False
|
return True
|
||||||
|
|
||||||
if callable(self.url_matcher):
|
if callable(self.url_matcher):
|
||||||
# Single function matcher
|
# Single function matcher
|
||||||
|
|||||||
@@ -22,6 +22,8 @@ from urllib.parse import urlparse
|
|||||||
import random
|
import random
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
from .memory_utils import get_true_memory_usage_percent
|
||||||
|
|
||||||
|
|
||||||
class RateLimiter:
|
class RateLimiter:
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -96,7 +98,7 @@ class BaseDispatcher(ABC):
|
|||||||
self.rate_limiter = rate_limiter
|
self.rate_limiter = rate_limiter
|
||||||
self.monitor = monitor
|
self.monitor = monitor
|
||||||
|
|
||||||
def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> CrawlerRunConfig:
|
def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
|
||||||
"""Select the appropriate config for a given URL.
|
"""Select the appropriate config for a given URL.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -104,23 +106,23 @@ class BaseDispatcher(ABC):
|
|||||||
configs: Single config or list of configs to choose from
|
configs: Single config or list of configs to choose from
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The matching config, or the first config if no match, or a default config if empty list
|
The matching config, or None if no match found
|
||||||
"""
|
"""
|
||||||
# Single config - return as is
|
# Single config - return as is
|
||||||
if isinstance(configs, CrawlerRunConfig):
|
if isinstance(configs, CrawlerRunConfig):
|
||||||
return configs
|
return configs
|
||||||
|
|
||||||
# Empty list - return default config
|
# Empty list - return None
|
||||||
if not configs:
|
if not configs:
|
||||||
return CrawlerRunConfig()
|
return None
|
||||||
|
|
||||||
# Find first matching config
|
# Find first matching config
|
||||||
for config in configs:
|
for config in configs:
|
||||||
if config.is_match(url):
|
if config.is_match(url):
|
||||||
return config
|
return config
|
||||||
|
|
||||||
# No match found - return first config as fallback
|
# No match found - return None to indicate URL should be skipped
|
||||||
return configs[0]
|
return None
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def crawl_url(
|
async def crawl_url(
|
||||||
@@ -173,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
async def _memory_monitor_task(self):
|
async def _memory_monitor_task(self):
|
||||||
"""Background task to continuously monitor memory usage and update state"""
|
"""Background task to continuously monitor memory usage and update state"""
|
||||||
while True:
|
while True:
|
||||||
self.current_memory_percent = psutil.virtual_memory().percent
|
self.current_memory_percent = get_true_memory_usage_percent()
|
||||||
|
|
||||||
# Enter memory pressure mode if we cross the threshold
|
# Enter memory pressure mode if we cross the threshold
|
||||||
if self.current_memory_percent >= self.memory_threshold_percent:
|
if self.current_memory_percent >= self.memory_threshold_percent:
|
||||||
@@ -237,6 +239,34 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
# Select appropriate config for this URL
|
# Select appropriate config for this URL
|
||||||
selected_config = self.select_config(url, config)
|
selected_config = self.select_config(url, config)
|
||||||
|
|
||||||
|
# If no config matches, return failed result
|
||||||
|
if selected_config is None:
|
||||||
|
error_message = f"No matching configuration found for URL: {url}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
status=CrawlStatus.FAILED,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html="",
|
||||||
|
metadata={"status": "no_config_match"},
|
||||||
|
success=False,
|
||||||
|
error_message=error_message
|
||||||
|
),
|
||||||
|
memory_usage=0,
|
||||||
|
peak_memory=0,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=time.time(),
|
||||||
|
error_message=error_message,
|
||||||
|
retry_count=retry_count
|
||||||
|
)
|
||||||
|
|
||||||
# Get starting memory for accurate measurement
|
# Get starting memory for accurate measurement
|
||||||
process = psutil.Process()
|
process = psutil.Process()
|
||||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
@@ -611,6 +641,33 @@ class SemaphoreDispatcher(BaseDispatcher):
|
|||||||
|
|
||||||
# Select appropriate config for this URL
|
# Select appropriate config for this URL
|
||||||
selected_config = self.select_config(url, config)
|
selected_config = self.select_config(url, config)
|
||||||
|
|
||||||
|
# If no config matches, return failed result
|
||||||
|
if selected_config is None:
|
||||||
|
error_message = f"No matching configuration found for URL: {url}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
status=CrawlStatus.FAILED,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html="",
|
||||||
|
metadata={"status": "no_config_match"},
|
||||||
|
success=False,
|
||||||
|
error_message=error_message
|
||||||
|
),
|
||||||
|
memory_usage=0,
|
||||||
|
peak_memory=0,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=time.time(),
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.monitor:
|
if self.monitor:
|
||||||
|
|||||||
79
crawl4ai/memory_utils.py
Normal file
79
crawl4ai/memory_utils.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
import psutil
|
||||||
|
import platform
|
||||||
|
import subprocess
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def get_true_available_memory_gb() -> float:
    """Get truly available memory including inactive pages (cross-platform).

    On macOS the figure is derived from ``vm_stat`` by summing the free,
    inactive, speculative and purgeable page counts and multiplying by the
    page size. On every other platform, and whenever ``vm_stat`` cannot be
    run or parsed, ``psutil.virtual_memory().available`` is returned instead.

    Returns:
        float: Available memory in GiB (bytes / 1024**3).
    """
    vm = psutil.virtual_memory()
    fallback_gb = vm.available / (1024**3)

    if platform.system() != 'Darwin':
        # For Windows and Linux, psutil.available is accurate
        return fallback_gb

    try:
        # check=True turns a non-zero exit status into CalledProcessError and
        # the timeout keeps a hung vm_stat from blocking the caller forever;
        # both are SubprocessError subclasses. OSError covers a missing binary.
        result = subprocess.run(
            ['vm_stat'],
            capture_output=True,
            text=True,
            check=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        return fallback_gb

    # Used only if the header does not state the page size. vm_stat normally
    # prints "... (page size of N bytes)" on its first line, and N differs
    # between machines, so the reported value is preferred when present.
    page_size = 16384
    counters = (
        ('Pages free:', 'free'),
        ('Pages inactive:', 'inactive'),
        ('Pages speculative:', 'speculative'),
        ('Pages purgeable:', 'purgeable'),
    )
    pages = {}

    try:
        for line in result.stdout.split('\n'):
            if 'page size of' in line:
                page_size = int(line.split('page size of')[1].split()[0])
                continue
            for label, key in counters:
                if label in line:
                    # Values look like "12345." - drop the trailing period.
                    pages[key] = int(line.split()[-1].rstrip('.'))
                    break
    except (ValueError, IndexError):
        # Unexpected vm_stat format - trust psutil rather than guess.
        return fallback_gb

    if not pages:
        # Nothing recognisable was parsed. Reporting 0 GiB here would read as
        # 100% memory usage downstream, so fall back to psutil instead.
        return fallback_gb

    # Calculate total available (free + inactive + speculative + purgeable)
    total_available_pages = sum(pages.values())
    return (total_available_pages * page_size) / (1024**3)
|
||||||
|
|
||||||
|
|
||||||
|
def get_true_memory_usage_percent() -> float:
    """
    Report how much of total memory is in use, as a percentage.

    The "in use" share is whatever is left after subtracting the value of
    get_true_available_memory_gb() from psutil's reported total.

    Returns:
        float: Memory usage percentage, clamped to the range 0-100
    """
    bytes_per_gb = 1024**3
    total_gb = psutil.virtual_memory().total / bytes_per_gb
    available_gb = get_true_available_memory_gb()

    # Share of total memory that is not truly available
    in_use_gb = total_gb - available_gb
    used_percent = 100.0 * in_use_gb / total_gb

    # Clamp: the available figure may exceed the total or be slightly off
    capped = min(100.0, used_percent)
    return max(0.0, capped)
|
||||||
|
|
||||||
|
|
||||||
|
def get_memory_stats() -> Tuple[float, float, float]:
    """
    Get comprehensive memory statistics from a single measurement.

    All three values are derived from one psutil sample and one call to
    get_true_available_memory_gb(), so the percentage always agrees with the
    available and total figures returned next to it. The percentage uses the
    same formula and 0-100 clamp as get_true_memory_usage_percent().

    Returns:
        Tuple[float, float, float]: (used_percent, available_gb, total_gb)
    """
    vm = psutil.virtual_memory()
    total_gb = vm.total / (1024**3)
    available_gb = get_true_available_memory_gb()

    # Compute the percentage here instead of calling
    # get_true_memory_usage_percent(): that would take a second, independent
    # sample and could disagree with available_gb above.
    used_percent = 100.0 * (total_gb - available_gb) / total_gb
    used_percent = max(0.0, min(100.0, used_percent))

    return used_percent, available_gb, total_gb
|
||||||
@@ -188,7 +188,6 @@ async def demo_part2_practical_crawling():
|
|||||||
lambda url: 'api' in url or 'httpbin.org' in url # Function for API endpoints
|
lambda url: 'api' in url or 'httpbin.org' in url # Function for API endpoints
|
||||||
],
|
],
|
||||||
match_mode=MatchMode.OR,
|
match_mode=MatchMode.OR,
|
||||||
extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
|
|
||||||
),
|
),
|
||||||
|
|
||||||
# Config 5: Complex matcher - Secure documentation sites
|
# Config 5: Complex matcher - Secure documentation sites
|
||||||
@@ -200,11 +199,11 @@ async def demo_part2_practical_crawling():
|
|||||||
lambda url: not url.endswith(('.pdf', '.json')) # Not PDF or JSON
|
lambda url: not url.endswith(('.pdf', '.json')) # Not PDF or JSON
|
||||||
],
|
],
|
||||||
match_mode=MatchMode.AND,
|
match_mode=MatchMode.AND,
|
||||||
wait_for="css:.content, css:article" # Wait for content to load
|
# wait_for="css:.content, css:article" # Wait for content to load
|
||||||
),
|
),
|
||||||
|
|
||||||
# Default config for everything else
|
# Default config for everything else
|
||||||
CrawlerRunConfig() # No url_matcher means it never matches (except as fallback)
|
# CrawlerRunConfig() # No url_matcher means it matches everything (use it as fallback)
|
||||||
]
|
]
|
||||||
|
|
||||||
# URLs to crawl - each will use a different config
|
# URLs to crawl - each will use a different config
|
||||||
@@ -214,7 +213,7 @@ async def demo_part2_practical_crawling():
|
|||||||
"https://github.com/microsoft/playwright", # → JS config
|
"https://github.com/microsoft/playwright", # → JS config
|
||||||
"https://httpbin.org/json", # → Mixed matcher config (API)
|
"https://httpbin.org/json", # → Mixed matcher config (API)
|
||||||
"https://docs.python.org/3/reference/", # → Complex matcher config
|
"https://docs.python.org/3/reference/", # → Complex matcher config
|
||||||
"https://example.com/", # → Default config
|
"https://www.w3schools.com/", # → Default config if the default config line above is uncommented; otherwise this URL fails with "No matching configuration found"
|
||||||
]
|
]
|
||||||
|
|
||||||
print("URLs to crawl:")
|
print("URLs to crawl:")
|
||||||
|
|||||||
@@ -447,11 +447,11 @@ async def crawl_mixed_content():
|
|||||||
# API endpoints - JSON extraction
|
# API endpoints - JSON extraction
|
||||||
CrawlerRunConfig(
|
CrawlerRunConfig(
|
||||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||||
extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
|
# Custom settings for JSON extraction
|
||||||
),
|
),
|
||||||
|
|
||||||
# Default config for everything else
|
# Default config for everything else
|
||||||
CrawlerRunConfig() # No url_matcher = fallback
|
CrawlerRunConfig() # No url_matcher means it matches ALL URLs (fallback)
|
||||||
]
|
]
|
||||||
|
|
||||||
# Mixed URLs
|
# Mixed URLs
|
||||||
@@ -475,6 +475,8 @@ async def crawl_mixed_content():
|
|||||||
|
|
||||||
### 6.2 Advanced Pattern Matching
|
### 6.2 Advanced Pattern Matching
|
||||||
|
|
||||||
|
**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration.
|
||||||
|
|
||||||
The `url_matcher` parameter supports three types of patterns:
|
The `url_matcher` parameter supports three types of patterns:
|
||||||
|
|
||||||
#### Glob Patterns (Strings)
|
#### Glob Patterns (Strings)
|
||||||
@@ -560,11 +562,17 @@ async def crawl_news_site():
|
|||||||
### 6.4 Best Practices
|
### 6.4 Best Practices
|
||||||
|
|
||||||
1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
|
1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
|
||||||
2. **Always Include a Default**: Last config should have no `url_matcher` as a fallback
|
2. **Default Config Behavior**:
|
||||||
|
- A config without `url_matcher` matches ALL URLs
|
||||||
|
- Always include a default config as the last item if you want to handle all URLs
|
||||||
|
- Without a default config, unmatched URLs will fail with "No matching configuration found"
|
||||||
3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
|
3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
|
||||||
```python
|
```python
|
||||||
config = CrawlerRunConfig(url_matcher="*/api/*")
|
config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||||
print(config.is_match("https://example.com/api/users")) # True
|
print(config.is_match("https://example.com/doc.pdf")) # True
|
||||||
|
|
||||||
|
default_config = CrawlerRunConfig() # No url_matcher
|
||||||
|
print(default_config.is_match("https://any-url.com")) # True - matches everything!
|
||||||
```
|
```
|
||||||
4. **Optimize for Performance**:
|
4. **Optimize for Performance**:
|
||||||
- Disable JS for static content
|
- Disable JS for static content
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ github_config = CrawlerRunConfig(
|
|||||||
# API endpoints - JSON extraction
|
# API endpoints - JSON extraction
|
||||||
api_config = CrawlerRunConfig(
|
api_config = CrawlerRunConfig(
|
||||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||||
extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
|
# Custom settings for JSON extraction
|
||||||
)
|
)
|
||||||
|
|
||||||
# Default fallback config
|
# Default fallback config
|
||||||
@@ -160,6 +160,7 @@ results = await crawler.arun_many(
|
|||||||
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||||
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||||
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||||
|
- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.
|
||||||
|
|
||||||
### Return Value
|
### Return Value
|
||||||
|
|
||||||
|
|||||||
@@ -214,7 +214,7 @@ See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detaile
|
|||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types |
|
| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. **None means match ALL URLs** |
|
||||||
| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |
|
| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |
|
||||||
|
|
||||||
The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
|
The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
|
||||||
@@ -239,7 +239,7 @@ blog_config = CrawlerRunConfig(
|
|||||||
# Function matcher
|
# Function matcher
|
||||||
api_config = CrawlerRunConfig(
|
api_config = CrawlerRunConfig(
|
||||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||||
extraction_strategy=JsonCssExtractionStrategy({"data": "body"})
|
# Other settings like extraction_strategy
|
||||||
)
|
)
|
||||||
|
|
||||||
# Mixed: String + Function with AND logic
|
# Mixed: String + Function with AND logic
|
||||||
@@ -257,14 +257,21 @@ secure_docs = CrawlerRunConfig(
|
|||||||
url_matcher=["https://*", lambda url: '.doc' in url],
|
url_matcher=["https://*", lambda url: '.doc' in url],
|
||||||
match_mode=MatchMode.AND # Must be HTTPS AND contain .doc
|
match_mode=MatchMode.AND # Must be HTTPS AND contain .doc
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Default config - matches ALL URLs
|
||||||
|
default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
||||||
```
|
```
|
||||||
|
|
||||||
**UrlMatcher Types:**
|
**UrlMatcher Types:**
|
||||||
|
- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs
|
||||||
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
|
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
|
||||||
- **Functions**: `lambda url: bool` - Custom logic for complex matching
|
- **Functions**: `lambda url: bool` - Custom logic for complex matching
|
||||||
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`
|
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`
|
||||||
|
|
||||||
When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
|
**Important Behavior:**
|
||||||
|
- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
|
||||||
|
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||||
|
- Always include a default config as the last item if you want to handle all URLs
|
||||||
|
|
||||||
---## 2.2 Helper Methods
|
---## 2.2 Helper Methods
|
||||||
|
|
||||||
|
|||||||
71
tests/test_memory_macos.py
Executable file
71
tests/test_memory_macos.py
Executable file
@@ -0,0 +1,71 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test script to verify macOS memory calculation accuracy."""
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
import platform
|
||||||
|
import time
|
||||||
|
from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
|
||||||
|
|
||||||
|
|
||||||
|
def test_memory_calculation():
    """Print psutil's memory figures next to the platform-aware ones.

    Purely diagnostic: nothing is asserted. The report shows both
    calculations, which of the listed thresholds each one would cross, and
    then samples both once per second for ten seconds.
    """
    gib = 1024**3

    print(f"Platform: {platform.system()}")
    print(f"Python version: {platform.python_version()}")
    print("-" * 60)

    # Get psutil's view
    snapshot = psutil.virtual_memory()
    psutil_percent = snapshot.percent
    psutil_available_gb = snapshot.available / gib
    total_gb = snapshot.total / gib

    # Get our corrected view
    true_percent = get_true_memory_usage_percent()
    true_available_gb = get_true_available_memory_gb()
    # Called for completeness; its values are not part of the report.
    _stats_percent, _stats_available, _stats_total = get_memory_stats()

    print("Memory Statistics Comparison:")
    print(f"Total Memory: {total_gb:.2f} GB")
    print()

    print("PSUtil (Standard) Calculation:")
    print(f" - Memory Used: {psutil_percent:.1f}%")
    print(f" - Available: {psutil_available_gb:.2f} GB")
    print()

    print("Platform-Aware Calculation:")
    print(f" - Memory Used: {true_percent:.1f}%")
    print(f" - Available: {true_available_gb:.2f} GB")
    print(f" - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory")
    print()

    # Show the impact on dispatcher behavior
    print("Impact on MemoryAdaptiveDispatcher:")
    thresholds = (
        ("Normal", 90.0),
        ("Critical", 95.0),
        ("Recovery", 85.0),
    )
    for name, threshold in thresholds:
        psutil_triggered = psutil_percent >= threshold
        true_triggered = true_percent >= threshold
        print(f" - {name} Threshold ({threshold}%):")
        print(f" PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
        print(f" Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
        if psutil_triggered != true_triggered:
            print(f" → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
    print()

    # Monitor for a few seconds
    print("Monitoring memory for 10 seconds...")
    for i in range(10):
        vm = psutil.virtual_memory()
        true_pct = get_true_memory_usage_percent()
        # "\r" keeps the readout on a single, continuously updated line
        print(f" {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
        time.sleep(1)
    print("\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_memory_calculation()
|
||||||
@@ -55,13 +55,13 @@ async def test_multi_config():
|
|||||||
|
|
||||||
# Test URLs - using real URLs that exist
|
# Test URLs - using real URLs that exist
|
||||||
test_urls = [
|
test_urls = [
|
||||||
# "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF
|
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF
|
||||||
# "https://www.bbc.com/news/articles/c5y3e3glnldo", # News article
|
"https://www.bbc.com/news/articles/c5y3e3glnldo", # News article
|
||||||
# "https://blog.python.org/", # Blog URL
|
"https://blog.python.org/", # Blog URL
|
||||||
# "https://api.github.com/users/github", # GitHub API (returns JSON)
|
"https://api.github.com/users/github", # GitHub API (returns JSON)
|
||||||
# "https://httpbin.org/json", # API endpoint that returns JSON
|
"https://httpbin.org/json", # API endpoint that returns JSON
|
||||||
# "https://www.python.org/", # Generic HTTPS page
|
"https://www.python.org/", # Generic HTTPS page
|
||||||
# "http://info.cern.ch/", # HTTP (not HTTPS) page
|
"http://info.cern.ch/", # HTTP (not HTTPS) page
|
||||||
"https://example.com/", # → Default config
|
"https://example.com/", # → Default config
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user