Fix critical issue where unmatched URLs incorrectly used the first config instead of failing safely. Also clarify that configs without url_matcher match ALL URLs by design, and improve memory usage monitoring.

Bug fixes:
- Change select_config() to return None when no config matches instead of using first config
- Add proper error handling in dispatchers when no config matches a URL
- Return failed CrawlResult with "No matching configuration found" error message
- Fix is_match() to return True when url_matcher is None (matches all URLs)
- Import and use get_true_memory_usage_percent() for more accurate memory monitoring

Behavior clarification:
- CrawlerRunConfig with url_matcher=None matches ALL URLs (not nothing)
- This is the intended behavior for default/fallback configurations
- Enables clean pattern: specific configs first, default config last

Documentation updates:
- Clarify that configs without url_matcher match everything
- Explain "No matching configuration found" error when no default config
- Add examples showing proper default config usage
- Update all relevant docs: multi-url-crawling.md, arun_many.md, parameters.md
- Simplify API config examples by removing extraction_strategy

Demo and test updates:
- Update demo_multi_config_clean.py with commented default config to show behavior
- Change example URL to w3schools.com to demonstrate no-match scenario
- Uncomment all test URLs in test_multi_config.py for comprehensive testing

Breaking changes: None - this restores the intended behavior

This ensures URLs only get processed with appropriate configs, preventing issues like HTML pages being processed with PDF extraction strategies.
117 lines
4.0 KiB
Python
117 lines
4.0 KiB
Python
"""
|
|
Test example for multiple crawler configs feature
|
|
"""
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode
|
|
|
|
async def test_multi_config():
    """Demonstrate URL-pattern based config selection with arun_many().

    Builds several CrawlerRunConfig objects with different url_matcher
    settings, first dry-runs the matching logic over a list of test URLs,
    then exercises the crawler with single-config and multi-config input.
    """
    # Create different configs for different URL patterns.

    # Config for PDF files (glob-style pattern).
    pdf_config = CrawlerRunConfig(
        url_matcher="*.pdf",
    )

    # Config for articles: several patterns combined with OR logic,
    # i.e. any one pattern matching is enough.
    article_config = CrawlerRunConfig(
        url_matcher=["*/news/*", "*blog*", "*/article/*"],
        match_mode=MatchMode.OR,
        screenshot=True,
    )

    # Config using a custom matcher function instead of glob patterns.
    api_config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or 'json' in url,
    )

    # Config combining patterns and functions with AND logic:
    # every entry must match for the config to apply.
    secure_docs_config = CrawlerRunConfig(
        url_matcher=[
            "*.doc*",  # Matches .doc, .docx
            lambda url: url.startswith('https://')  # Must be HTTPS
        ],
        match_mode=MatchMode.AND,
    )

    # Default config: url_matcher=None matches ALL URLs by design, so
    # placing it last makes it the catch-all fallback. Without a default
    # config in the list, unmatched URLs fail with a
    # "No matching configuration found" error instead of silently using
    # the first config.
    default_config = CrawlerRunConfig(
        # cache_mode=CacheMode.BYPASS,
    )

    # List of configs - order matters! First match wins.
    configs = [
        pdf_config,
        article_config,
        api_config,
        secure_docs_config,
        default_config,  # Fallback: matches anything the others missed
    ]

    # Test URLs - using real URLs that exist
    test_urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
        "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
        "https://blog.python.org/",  # Blog URL
        "https://api.github.com/users/github",  # GitHub API (returns JSON)
        "https://httpbin.org/json",  # API endpoint that returns JSON
        "https://www.python.org/",  # Generic HTTPS page
        "http://info.cern.ch/",  # HTTP (not HTTPS) page
        "https://example.com/",  # -> Default config
    ]

    # Dry-run the matching logic without crawling anything.
    print("Config matching test:")
    print("-" * 50)
    for url in test_urls:
        for i, config in enumerate(configs):
            if config.is_match(url):
                print(f"{url} -> Config {i} matches")
                break
        else:
            # No config matched. arun_many() returns a failed CrawlResult
            # for such URLs rather than falling back to the first config.
            print(f"{url} -> No matching configuration found")

    print("\n" + "=" * 50 + "\n")

    # Now test with the actual crawler.
    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works.
        print("Test 1: Single config (backwards compatible)")
        result = await crawler.arun_many(
            urls=["https://www.python.org/"],
            config=default_config
        )
        print(f"Crawled {len(result)} URLs with single config\n")

        # Multiple configs - new feature.
        print("Test 2: Multiple configs")
        # Just test with 2 URLs to avoid timeout
        results = await crawler.arun_many(
            urls=test_urls[:2],  # Just test first 2 URLs
            config=configs  # Pass list of configs
        )
        print(f"Crawled {len(results)} URLs with multiple configs")

        # Using a custom matcher defined inline.
        print("\nTest 3: Inline custom matcher")
        custom_config = CrawlerRunConfig(
            url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(),
            verbose=False
        )
        results = await crawler.arun_many(
            urls=[
                "https://docs.python.org/3/library/asyncio.html",  # Long URL with 'python'
                "https://python.org/",  # Short URL with 'python' - won't match
                "https://www.google.com/"  # No 'python' - won't match
            ],
            config=[custom_config, default_config]
        )
        print(f"Crawled {len(results)} URLs with custom matcher")
|
# Allow running this demo directly: `python test_multi_config.py`
if __name__ == "__main__":
    asyncio.run(test_multi_config())