Implement dynamic configuration selection based on URL patterns to optimize crawling for different content types. This feature enables users to apply different crawling strategies (PDF extraction, content filtering, JavaScript execution) based on URL matching patterns.

Key additions:
- Add url_matcher and match_mode parameters to CrawlerRunConfig
- Implement is_match() method supporting string patterns, functions, and mixed lists
- Add MatchMode enum for OR/AND logic when combining multiple matchers
- Update AsyncWebCrawler.arun_many() to accept List[CrawlerRunConfig] (see the sketch below)
- Add select_config() method to dispatchers for runtime config selection
- First matching config wins, with fallback to the default

Pattern matching supports:
- Glob-style strings: *.pdf, */blog/*, *api*
- Lambda functions: lambda url: 'github.com' in url
- Mixed patterns with AND/OR logic for complex matching

This enables optimal per-URL configuration:
- PDFs: use PDFContentScrapingStrategy without JavaScript
- Blogs: apply content filtering to reduce noise
- APIs: skip JavaScript, use JSON extraction
- Dynamic sites: execute only the JavaScript that is necessary

Breaking changes: none; fully backward compatible.
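Before the test file, here is a minimal end-to-end sketch of what the per-URL dispatch might look like. It is an illustration under assumptions, not code from this change: the top-level import path, the urls=/config= keyword names, and the result attributes (url, success) follow common crawl4ai usage but are not exercised by the test below, and the fallback behavior for unmatched URLs is taken from the description above.

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig  # assumed import path

async def main():
    # One config per content type; per the description, the first config
    # whose url_matcher matches a given URL wins.
    configs = [
        # PDFs: no JavaScript needed (a PDF scraping strategy could be set here)
        CrawlerRunConfig(url_matcher="*.pdf"),
        # Blog/article pages: where content filtering would be applied
        CrawlerRunConfig(url_matcher=["*/blog/*", "*/article/*"]),
        # API endpoints: matchers can also be plain functions
        CrawlerRunConfig(url_matcher=lambda url: "api" in url),
    ]
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=[
                "https://example.com/whitepaper.pdf",
                "https://example.com/blog/launch",
                "https://api.example.com/v1/users",
            ],
            config=configs,  # List[CrawlerRunConfig]: dispatcher selects per URL
        )
        for result in results:
            print(result.url, result.success)

asyncio.run(main())

Because arun_many() still accepts a single CrawlerRunConfig, existing callers are untouched, which is what makes the change backward compatible.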
"""
|
|
Test only the config matching logic without running crawler
|
|
"""
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
|
|
|
def test_all_matching_scenarios():
|
|
print("Testing CrawlerRunConfig.is_match() method")
|
|
print("=" * 50)
|
|
|
|
# Test 1: Single string pattern
|
|
print("\n1. Single string pattern (glob style)")
|
|
config = CrawlerRunConfig(
|
|
url_matcher="*.pdf",
|
|
# For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
|
|
)
|
|
test_urls = [
|
|
("https://example.com/file.pdf", True),
|
|
("https://example.com/doc.PDF", False), # Case sensitive
|
|
("https://example.com/file.txt", False),
|
|
("file.pdf", True),
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 2: List of patterns with OR
|
|
print("\n2. List of patterns with OR (default)")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=["*/article/*", "*/blog/*", "*.html"],
|
|
match_mode=MatchMode.OR
|
|
)
|
|
test_urls = [
|
|
("https://example.com/article/news", True),
|
|
("https://example.com/blog/post", True),
|
|
("https://example.com/page.html", True),
|
|
("https://example.com/page.php", False),
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 3: Custom function
|
|
print("\n3. Custom function matcher")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
|
|
)
|
|
test_urls = [
|
|
("https://api.example.com/data.json", True),
|
|
("https://api.example.com/data.xml", True),
|
|
("https://api.example.com/data.html", False),
|
|
("https://example.com/data.json", False), # No 'api'
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 4: Mixed list with AND
|
|
print("\n4. Mixed patterns and functions with AND")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=[
|
|
"https://*", # Must be HTTPS
|
|
lambda url: '.com' in url, # Must have .com
|
|
lambda url: len(url) < 50 # Must be short
|
|
],
|
|
match_mode=MatchMode.AND
|
|
)
|
|
test_urls = [
|
|
("https://example.com/page", True),
|
|
("http://example.com/page", False), # Not HTTPS
|
|
("https://example.org/page", False), # No .com
|
|
("https://example.com/" + "x" * 50, False), # Too long
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 5: Complex real-world scenario
|
|
print("\n5. Complex pattern combinations")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=[
|
|
"*/api/v[0-9]/*", # API versioned endpoints
|
|
lambda url: 'graphql' in url, # GraphQL endpoints
|
|
"*.json" # JSON files
|
|
],
|
|
match_mode=MatchMode.OR
|
|
)
|
|
test_urls = [
|
|
("https://example.com/api/v1/users", True),
|
|
("https://example.com/api/v2/posts", True),
|
|
("https://example.com/graphql", True),
|
|
("https://example.com/data.json", True),
|
|
("https://example.com/api/users", False), # No version
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 6: Edge cases
|
|
print("\n6. Edge cases")
|
|
|
|
# No matcher
|
|
config = CrawlerRunConfig()
|
|
result = config.is_match("https://example.com")
|
|
print(f" {'✓' if not result else '✗'} No matcher -> {result}")
|
|
|
|
# Empty list
|
|
config = CrawlerRunConfig(url_matcher=[])
|
|
result = config.is_match("https://example.com")
|
|
print(f" {'✓' if not result else '✗'} Empty list -> {result}")
|
|
|
|
# None in list (should be skipped)
|
|
config = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"])
|
|
result = config.is_match("test.pdf")
|
|
print(f" {'✓' if result else '✗'} List with None -> {result}")
|
|
|
|
print("\n" + "=" * 50)
|
|
print("All matching tests completed!")
|
|
|
|
if __name__ == "__main__":
|
|
test_all_matching_scenarios() |