feat: Add URL-specific crawler configurations for multi-URL crawling

Implement dynamic configuration selection based on URL patterns to optimize crawling for different content types. This feature enables users to apply different crawling strategies (PDF extraction, content filtering, JavaScript execution) based on URL matching patterns.

Key additions:
- Add url_matcher and match_mode parameters to CrawlerRunConfig
- Implement is_match() method supporting string patterns, functions, and mixed lists
- Add MatchMode enum for OR/AND logic when combining multiple matchers
- Update AsyncWebCrawler.arun_many() to accept List[CrawlerRunConfig]
- Add select_config() method to dispatchers for runtime config selection
- First matching config wins, with fallback to default

Pattern matching supports:
- Glob-style strings: *.pdf, */blog/*, *api*
- Lambda functions: lambda url: 'github.com' in url
- Mixed patterns with AND/OR logic for complex matching

This enables optimal per-URL configuration:
- PDFs: Use PDFContentScrapingStrategy without JavaScript
- Blogs: Apply content filtering to reduce noise
- APIs: Skip JavaScript, use JSON extraction
- Dynamic sites: Execute only necessary JavaScript

Breaking changes: None - fully backward compatible
This commit is contained in:
ntohidi
2025-08-02 19:10:36 +08:00
parent 864d87afb2
commit a03e68fa2f
13 changed files with 1096 additions and 20 deletions

42
tests/test_arun_many.py Normal file
View File

@@ -0,0 +1,42 @@
"""
Test example for multiple crawler configs feature
"""
import asyncio
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
async def test_run_many():
    """Crawl a couple of URLs with one shared config and print each status.

    Exercises the single-config calling convention of
    ``AsyncWebCrawler.arun_many`` (one ``CrawlerRunConfig`` for every URL).
    """
    candidate_urls = [
        # "https://blog.python.org/",  # Blog URL
        "https://www.python.org/",  # Generic HTTPS page
        "https://www.kidocode.com/",  # Generic HTTPS page
        "https://www.example.com/",  # Generic HTTPS page
        # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
    ]

    # A PDF scraping strategy could be plugged in here via
    # scraping_strategy=PDFContentScrapingStrategy(); it is left disabled.
    shared_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        print("Test 1: Single config (backwards compatible)")

        # Only the first two URLs are crawled.
        first_two = candidate_urls[:2]
        crawled = await crawler.arun_many(urls=first_two, config=shared_config)

        print(f"Crawled {len(crawled)} URLs with single config\n")
        for page in crawled:
            print(f" {page.url} -> {page.status_code}")


if __name__ == "__main__":
    asyncio.run(test_run_many())

View File

@@ -0,0 +1,131 @@
"""
Test only the config matching logic without running crawler
"""
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
def _report_matches(config, cases):
    """Print one pass/fail line per ``(url, expected)`` pair in *cases*.

    Each URL is passed to ``config.is_match`` and the outcome is compared
    with the expected boolean; a check mark or cross shows the verdict.
    """
    for url, expected in cases:
        result = config.is_match(url)
        status = "✓" if result == expected else "✗"
        print(f" {status} {url} -> {result}")


def test_all_matching_scenarios():
    """Run ``CrawlerRunConfig.is_match`` over a set of matcher scenarios.

    Covers a single glob string, a list of globs with OR, a custom function,
    a mixed list with AND, a mixed list with OR, and a few edge cases.
    Results are printed with a visible pass (✓) / fail (✗) marker; the
    function reports only and does not raise on a mismatch.
    """
    print("Testing CrawlerRunConfig.is_match() method")
    print("=" * 50)

    # Test 1: Single string pattern
    print("\n1. Single string pattern (glob style)")
    config = CrawlerRunConfig(
        url_matcher="*.pdf",
        # For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
    )
    _report_matches(config, [
        ("https://example.com/file.pdf", True),
        ("https://example.com/doc.PDF", False),  # Case sensitive
        ("https://example.com/file.txt", False),
        ("file.pdf", True),
    ])

    # Test 2: List of patterns with OR
    print("\n2. List of patterns with OR (default)")
    config = CrawlerRunConfig(
        url_matcher=["*/article/*", "*/blog/*", "*.html"],
        match_mode=MatchMode.OR
    )
    _report_matches(config, [
        ("https://example.com/article/news", True),
        ("https://example.com/blog/post", True),
        ("https://example.com/page.html", True),
        ("https://example.com/page.php", False),
    ])

    # Test 3: Custom function
    print("\n3. Custom function matcher")
    config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
    )
    _report_matches(config, [
        ("https://api.example.com/data.json", True),
        ("https://api.example.com/data.xml", True),
        ("https://api.example.com/data.html", False),
        ("https://example.com/data.json", False),  # No 'api'
    ])

    # Test 4: Mixed list with AND
    print("\n4. Mixed patterns and functions with AND")
    config = CrawlerRunConfig(
        url_matcher=[
            "https://*",  # Must be HTTPS
            lambda url: '.com' in url,  # Must have .com
            lambda url: len(url) < 50  # Must be short
        ],
        match_mode=MatchMode.AND
    )
    _report_matches(config, [
        ("https://example.com/page", True),
        ("http://example.com/page", False),  # Not HTTPS
        ("https://example.org/page", False),  # No .com
        ("https://example.com/" + "x" * 50, False),  # Too long
    ])

    # Test 5: Complex real-world scenario
    print("\n5. Complex pattern combinations")
    config = CrawlerRunConfig(
        url_matcher=[
            "*/api/v[0-9]/*",  # API versioned endpoints
            lambda url: 'graphql' in url,  # GraphQL endpoints
            "*.json"  # JSON files
        ],
        match_mode=MatchMode.OR
    )
    _report_matches(config, [
        ("https://example.com/api/v1/users", True),
        ("https://example.com/api/v2/posts", True),
        ("https://example.com/graphql", True),
        ("https://example.com/data.json", True),
        ("https://example.com/api/users", False),  # No version
    ])

    # Test 6: Edge cases
    print("\n6. Edge cases")

    # No matcher: a non-match is the passing outcome here.
    config = CrawlerRunConfig()
    result = config.is_match("https://example.com")
    print(f" {'✓' if not result else '✗'} No matcher -> {result}")

    # Empty list: a non-match is the passing outcome here.
    config = CrawlerRunConfig(url_matcher=[])
    result = config.is_match("https://example.com")
    print(f" {'✓' if not result else '✗'} Empty list -> {result}")

    # None in list (should be skipped), so "*.pdf" is still expected to match.
    config = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"])
    result = config.is_match("test.pdf")
    print(f" {'✓' if result else '✗'} List with None -> {result}")

    print("\n" + "=" * 50)
    print("All matching tests completed!")


if __name__ == "__main__":
    test_all_matching_scenarios()

View File

@@ -0,0 +1,87 @@
"""
Test config selection logic in dispatchers
"""
import asyncio
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
from crawl4ai.async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher
class TestDispatcher(BaseDispatcher):
    """Minimal dispatcher used to observe which config ``select_config`` picks.

    It performs no crawling: ``crawl_url`` only reports the identity of the
    config selected for a URL.
    """

    # The ``Test`` prefix would make pytest try to collect this helper as a
    # test class; opt out of collection explicitly.
    __test__ = False

    async def crawl_url(self, url, config, task_id, **kwargs):
        """Return ``url`` and the ``id()`` of the config selected for it.

        ``config`` is handed to ``select_config`` unchanged. ``task_id`` and
        any extra keyword arguments are accepted but not used.
        """
        selected = self.select_config(url, config)
        return {"url": url, "config_id": id(selected)}

    async def run_urls(self, urls, crawler, config):
        """Call ``crawl_url`` for each URL in order and return the results.

        ``crawler`` is accepted but not used.
        """
        return [await self.crawl_url(url, config, "test") for url in urls]
async def test_dispatcher_config_selection():
    """Check that dispatchers pick the expected config for a given URL.

    Covers a single config, a list of configs (first match wins, with the
    first entry used when nothing matches), ``MemoryAdaptiveDispatcher``'s
    ``select_config``, and the empty-list / ``None`` edge cases.
    """
    rule = "=" * 50
    print("Testing dispatcher config selection")
    print(rule)

    # Three configs: glob matcher, function matcher, and one with no matcher.
    pdf_config = CrawlerRunConfig(url_matcher="*.pdf")
    api_config = CrawlerRunConfig(url_matcher=lambda url: 'api' in url)
    default_config = CrawlerRunConfig()
    configs = [pdf_config, api_config, default_config]

    dispatcher = TestDispatcher()

    # --- Test 1: a lone config is passed straight through -----------------
    print("\nTest 1: Single config")
    outcome = await dispatcher.crawl_url(
        "https://example.com/file.pdf", pdf_config, "test1"
    )
    assert outcome["config_id"] == id(pdf_config)
    print("✓ Single config works")

    # --- Test 2: selection from a list ------------------------------------
    print("\nTest 2: Config list selection")
    expected_by_url = [
        ("https://example.com/file.pdf", pdf_config),
        ("https://api.example.com/data", api_config),
        ("https://example.com/page", configs[0]),  # No match, uses first
    ]
    for url, expected_config in expected_by_url:
        outcome = await dispatcher.crawl_url(url, configs, "test")
        assert outcome["config_id"] == id(expected_config), f"URL {url} got wrong config"
        print(f"{url} -> correct config selected")

    # --- Test 3: the real dispatcher's select_config ----------------------
    print("\nTest 3: MemoryAdaptiveDispatcher config selection")
    mem_dispatcher = MemoryAdaptiveDispatcher()
    chosen = mem_dispatcher.select_config("https://example.com/doc.pdf", configs)
    assert chosen == pdf_config
    print("✓ MemoryAdaptiveDispatcher.select_config works")

    # --- Test 4: empty list and None must yield a CrawlerRunConfig --------
    print("\nTest 4: Edge cases")
    edge_cases = (
        ([], "✓ Empty config list returns default config"),
        (None, "✓ None config returns default config"),
    )
    for degenerate_config, success_line in edge_cases:
        chosen = mem_dispatcher.select_config("https://example.com", degenerate_config)
        assert isinstance(chosen, CrawlerRunConfig)  # Should return default
        print(success_line)

    print("\n" + rule)
    print("All dispatcher tests passed! ✓")


if __name__ == "__main__":
    asyncio.run(test_dispatcher_config_selection())

117
tests/test_multi_config.py Normal file
View File

@@ -0,0 +1,117 @@
"""
Test example for multiple crawler configs feature
"""
import asyncio
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode
async def test_multi_config():
    """Demonstrate passing a list of URL-specific configs to ``arun_many``.

    First prints which config (by index) matches each test URL, then runs
    three crawls: a single config, the full config list, and a list whose
    first entry uses an inline custom matcher.
    """
    # One config per URL family; order matters because the first match wins.
    pdf_config = CrawlerRunConfig(url_matcher="*.pdf")

    # Articles: several glob patterns combined with OR.
    article_config = CrawlerRunConfig(
        url_matcher=["*/news/*", "*blog*", "*/article/*"],
        match_mode=MatchMode.OR,
        screenshot=True,
    )

    # APIs: matched by a custom predicate.
    api_config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or 'json' in url,
    )

    # Secure documents: a glob AND a predicate must both hold.
    secure_docs_config = CrawlerRunConfig(
        url_matcher=[
            "*.doc*",  # Matches .doc, .docx
            lambda url: url.startswith('https://'),  # Must be HTTPS
        ],
        match_mode=MatchMode.AND,
    )

    # No url_matcher set on this one (cache_mode=CacheMode.BYPASS left disabled).
    default_config = CrawlerRunConfig()

    configs = [
        pdf_config,
        article_config,
        api_config,
        secure_docs_config,
        default_config,
    ]

    # Real URLs; most are disabled to keep the run short.
    test_urls = [
        # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
        # "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
        # "https://blog.python.org/",  # Blog URL
        # "https://api.github.com/users/github",  # GitHub API (returns JSON)
        # "https://httpbin.org/json",  # API endpoint that returns JSON
        # "https://www.python.org/",  # Generic HTTPS page
        # "http://info.cern.ch/",  # HTTP (not HTTPS) page
        "https://example.com/",
    ]

    # Dry run of the matching logic: report the index of the first match.
    print("Config matching test:")
    print("-" * 50)
    for url in test_urls:
        first_hit = next(
            (idx for idx, cfg in enumerate(configs) if cfg.is_match(url)),
            None,
        )
        if first_hit is None:
            print(f"{url} -> No match, will use fallback (first config)")
        else:
            print(f"{url} -> Config {first_hit} matches")

    print("\n" + "=" * 50 + "\n")

    # Real crawls.
    async with AsyncWebCrawler() as crawler:
        # Traditional usage: one config for every URL.
        print("Test 1: Single config (backwards compatible)")
        single_run = await crawler.arun_many(
            urls=["https://www.python.org/"],
            config=default_config,
        )
        print(f"Crawled {len(single_run)} URLs with single config\n")

        # New feature: a list of configs; at most two URLs to avoid timeouts.
        print("Test 2: Multiple configs")
        at_most_two = test_urls[:2]
        multi_run = await crawler.arun_many(urls=at_most_two, config=configs)
        print(f"Crawled {len(multi_run)} URLs with multiple configs")

        # Inline matcher: URLs longer than 50 characters that mention "python".
        print("\nTest 3: Inline custom matcher")
        custom_config = CrawlerRunConfig(
            url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(),
            verbose=False,
        )
        inline_urls = [
            "https://docs.python.org/3/library/asyncio.html",
            "https://python.org/",
            "https://www.google.com/",
        ]
        inline_run = await crawler.arun_many(
            urls=inline_urls,
            config=[custom_config, default_config],
        )
        print(f"Crawled {len(inline_run)} URLs with custom matcher")


if __name__ == "__main__":
    asyncio.run(test_multi_config())