Implement dynamic configuration selection based on URL patterns to optimize crawling for different content types. This feature enables users to apply different crawling strategies (PDF extraction, content filtering, JavaScript execution) based on URL matching patterns.

Key additions:
- Add url_matcher and match_mode parameters to CrawlerRunConfig
- Implement is_match() method supporting string patterns, functions, and mixed lists
- Add MatchMode enum for OR/AND logic when combining multiple matchers
- Update AsyncWebCrawler.arun_many() to accept List[CrawlerRunConfig]
- Add select_config() method to dispatchers for runtime config selection
- First matching config wins, with fallback to the default

Pattern matching supports:
- Glob-style strings: *.pdf, */blog/*, *api*
- Lambda functions: lambda url: 'github.com' in url
- Mixed patterns with AND/OR logic for complex matching

This enables optimal per-URL configuration:
- PDFs: Use PDFContentScrapingStrategy without JavaScript
- Blogs: Apply content filtering to reduce noise
- APIs: Skip JavaScript, use JSON extraction
- Dynamic sites: Execute only necessary JavaScript

Breaking changes: None - fully backward compatible
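Before the full test script, here is a minimal sketch of the matching semantics described above. It is illustrative, not the library's implementation: it assumes glob patterns follow Python's fnmatch semantics, that callables receive the URL and return a bool, and that an absent matcher never matches (so such a config only applies as an explicit fallback). The helper name url_matches is hypothetical.

from fnmatch import fnmatch
from typing import Callable, List, Union

Matcher = Union[str, Callable[[str], bool]]

def url_matches(url: str, matcher: Union[Matcher, List[Matcher], None],
                and_logic: bool = False) -> bool:
    """Sketch of is_match()-style logic: globs, callables, or mixed lists."""
    if matcher is None:
        return False  # no matcher: config can only serve as an explicit fallback
    matchers = matcher if isinstance(matcher, list) else [matcher]
    results = (m(url) if callable(m) else fnmatch(url, m) for m in matchers)
    return all(results) if and_logic else any(results)

# Glob, callable, and mixed AND-logic examples:
assert url_matches("https://example.com/report.pdf", "*.pdf")
assert url_matches("https://api.github.com/users/github", lambda u: "api" in u)
assert url_matches("https://example.com/spec.docx",
                   ["*.doc*", lambda u: u.startswith("https://")], and_logic=True)

The test script below exercises the real API end to end.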
"""
|
|
Test example for multiple crawler configs feature
|
|
"""
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode
|
|
|
|
async def test_multi_config():
|
|
# Create different configs for different URL patterns
|
|
|
|
# Config for PDF files
|
|
pdf_config = CrawlerRunConfig(
|
|
url_matcher="*.pdf",
|
|
)
|
|
|
|
# Config for articles (using multiple patterns with OR logic)
|
|
article_config = CrawlerRunConfig(
|
|
url_matcher=["*/news/*", "*blog*", "*/article/*"],
|
|
match_mode=MatchMode.OR,
|
|
screenshot=True,
|
|
)
|
|
|
|
# Config using custom matcher function
|
|
api_config = CrawlerRunConfig(
|
|
url_matcher=lambda url: 'api' in url or 'json' in url,
|
|
)
|
|
|
|
# Config combining patterns and functions with AND logic
|
|
secure_docs_config = CrawlerRunConfig(
|
|
url_matcher=[
|
|
"*.doc*", # Matches .doc, .docx
|
|
lambda url: url.startswith('https://') # Must be HTTPS
|
|
],
|
|
match_mode=MatchMode.AND,
|
|
)
|
|
|
|
# Default config (no url_matcher means it won't match anything unless it's the fallback)
|
|
default_config = CrawlerRunConfig(
|
|
# cache_mode=CacheMode.BYPASS,
|
|
)
|
|
|
|
# List of configs - order matters! First match wins
|
|
configs = [
|
|
pdf_config,
|
|
article_config,
|
|
api_config,
|
|
secure_docs_config,
|
|
default_config # Fallback
|
|
]
|
|
|
|
# Test URLs - using real URLs that exist
|
|
test_urls = [
|
|
# "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF
|
|
# "https://www.bbc.com/news/articles/c5y3e3glnldo", # News article
|
|
# "https://blog.python.org/", # Blog URL
|
|
# "https://api.github.com/users/github", # GitHub API (returns JSON)
|
|
# "https://httpbin.org/json", # API endpoint that returns JSON
|
|
# "https://www.python.org/", # Generic HTTPS page
|
|
# "http://info.cern.ch/", # HTTP (not HTTPS) page
|
|
"https://example.com/", # → Default config
|
|
]
|
|
|
|
# Test the matching logic
|
|
print("Config matching test:")
|
|
print("-" * 50)
|
|
for url in test_urls:
|
|
for i, config in enumerate(configs):
|
|
if config.is_match(url):
|
|
print(f"{url} -> Config {i} matches")
|
|
break
|
|
else:
|
|
print(f"{url} -> No match, will use fallback (first config)")
|
|
|
|
print("\n" + "=" * 50 + "\n")
|
|
|
|
# Now test with actual crawler
|
|
async with AsyncWebCrawler() as crawler:
|
|
# Single config - traditional usage still works
|
|
print("Test 1: Single config (backwards compatible)")
|
|
result = await crawler.arun_many(
|
|
urls=["https://www.python.org/"],
|
|
config=default_config
|
|
)
|
|
print(f"Crawled {len(result)} URLs with single config\n")
|
|
|
|
# Multiple configs - new feature
|
|
print("Test 2: Multiple configs")
|
|
# Just test with 2 URLs to avoid timeout
|
|
results = await crawler.arun_many(
|
|
urls=test_urls[:2], # Just test first 2 URLs
|
|
config=configs # Pass list of configs
|
|
)
|
|
print(f"Crawled {len(results)} URLs with multiple configs")
|
|
|
|
# Using custom matcher inline
|
|
print("\nTest 3: Inline custom matcher")
|
|
custom_config = CrawlerRunConfig(
|
|
url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(),
|
|
verbose=False
|
|
)
|
|
results = await crawler.arun_many(
|
|
urls=[
|
|
"https://docs.python.org/3/library/asyncio.html", # Long URL with 'python'
|
|
"https://python.org/", # Short URL with 'python' - won't match
|
|
"https://www.google.com/" # No 'python' - won't match
|
|
],
|
|
config=[custom_config, default_config]
|
|
)
|
|
print(f"Crawled {len(results)} URLs with custom matcher")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(test_multi_config()) |
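The script never calls select_config() directly, but the runtime selection that the description attributes to dispatchers reduces to a first-match scan over the config list. The sketch below is an illustrative reconstruction, not the dispatcher's actual code; the fallback rule (prefer a config that declares no url_matcher) is inferred from the test's own comments, and the function name is hypothetical.

from typing import List, Optional

def select_config_sketch(url: str,
                         configs: List["CrawlerRunConfig"]) -> Optional["CrawlerRunConfig"]:
    """Illustrative first-match-wins selection, as a dispatcher might apply it."""
    for config in configs:
        if config.is_match(url):  # glob, callable, or mixed-list matching
            return config
    # Nothing matched: fall back to a config that declares no url_matcher
    for config in configs:
        if getattr(config, "url_matcher", None) is None:
            return config
    return None

Because the scan stops at the first hit, order in the config list encodes priority: specific matchers (e.g. "*.pdf") should precede broad ones, with the matcher-less default last.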