diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index be3cab0a..766a80a0 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode # MODIFIED: Add SeedingConfig and VirtualScrollConfig here -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode from .content_scraping_strategy import ( ContentScrapingStrategy, @@ -132,6 +132,7 @@ __all__ = [ "CrawlResult", "CrawlerHub", "CacheMode", + "MatchMode", "ContentScrapingStrategy", "WebScrapingStrategy", "LXMLWebScrapingStrategy", diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index d96916b4..7cab8f57 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -24,11 +24,18 @@ from .deep_crawling import DeepCrawlStrategy from .cache_context import CacheMode from .proxy_strategy import ProxyRotationStrategy -from typing import Union, List +from typing import Union, List, Callable import inspect from typing import Any, Dict, Optional from enum import Enum +# Type alias for URL matching +UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]] + +class MatchMode(Enum): + OR = "or" + AND = "and" + # from .proxy_strategy import ProxyConfig @@ -1113,6 +1120,9 @@ class CrawlerRunConfig(): link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None, # Virtual Scroll Parameters virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None, + # URL Matching Parameters + url_matcher: Optional[UrlMatcher] = None, + match_mode: MatchMode = MatchMode.OR, # Experimental Parameters experimental: Dict[str, Any] = None, ): @@ -1266,6 +1276,10 @@ class 
CrawlerRunConfig(): else: raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict") + # URL Matching Parameters + self.url_matcher = url_matcher + self.match_mode = match_mode + # Experimental Parameters self.experimental = experimental or {} @@ -1321,6 +1335,51 @@ class CrawlerRunConfig(): if "compilation error" not in str(e).lower(): raise ValueError(f"Failed to compile C4A script: {str(e)}") raise + + def is_match(self, url: str) -> bool: + """Check if this config matches the given URL. + + Args: + url: The URL to check against this config's matcher + + Returns: + bool: True if this config should be used for the URL + """ + if self.url_matcher is None: + return False + + if callable(self.url_matcher): + # Single function matcher + return self.url_matcher(url) + + elif isinstance(self.url_matcher, str): + # Single pattern string + from fnmatch import fnmatch + return fnmatch(url, self.url_matcher) + + elif isinstance(self.url_matcher, list): + # List of mixed matchers + if not self.url_matcher: # Empty list + return False + + results = [] + for matcher in self.url_matcher: + if callable(matcher): + results.append(matcher(url)) + elif isinstance(matcher, str): + from fnmatch import fnmatch + results.append(fnmatch(url, matcher)) + else: + # Skip invalid matchers + continue + + # Apply match mode logic + if self.match_mode == MatchMode.OR: + return any(results) if results else False + else: # AND mode + return all(results) if results else False + + return False def __getattr__(self, name): @@ -1443,6 +1502,9 @@ class CrawlerRunConfig(): # Link Extraction Parameters link_preview_config=kwargs.get("link_preview_config"), url=kwargs.get("url"), + # URL Matching Parameters + url_matcher=kwargs.get("url_matcher"), + match_mode=kwargs.get("match_mode", MatchMode.OR), # Experimental Parameters experimental=kwargs.get("experimental"), ) @@ -1540,6 +1602,8 @@ class CrawlerRunConfig(): "deep_crawl_strategy": self.deep_crawl_strategy, 
"link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None, "url": self.url, + "url_matcher": self.url_matcher, + "match_mode": self.match_mode, "experimental": self.experimental, } diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index 1558efc0..77739616 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, List, Tuple +from typing import Dict, Optional, List, Tuple, Union from .async_configs import CrawlerRunConfig from .models import ( CrawlResult, @@ -96,11 +96,37 @@ class BaseDispatcher(ABC): self.rate_limiter = rate_limiter self.monitor = monitor + def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> CrawlerRunConfig: + """Select the appropriate config for a given URL. + + Args: + url: The URL to match against + configs: Single config or list of configs to choose from + + Returns: + The matching config, or the first config if no match, or a default config if empty list + """ + # Single config - return as is + if isinstance(configs, CrawlerRunConfig): + return configs + + # Empty list - return default config + if not configs: + return CrawlerRunConfig() + + # Find first matching config + for config in configs: + if config.is_match(url): + return config + + # No match found - return first config as fallback + return configs[0] + @abstractmethod async def crawl_url( self, url: str, - config: CrawlerRunConfig, + config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], task_id: str, monitor: Optional[CrawlerMonitor] = None, ) -> CrawlerTaskResult: @@ -111,7 +137,7 @@ class BaseDispatcher(ABC): self, urls: List[str], crawler: AsyncWebCrawler, # noqa: F821 - config: CrawlerRunConfig, + config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], monitor: Optional[CrawlerMonitor] = None, ) -> List[CrawlerTaskResult]: pass @@ -200,7 +226,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): async 
def crawl_url( self, url: str, - config: CrawlerRunConfig, + config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], task_id: str, retry_count: int = 0, ) -> CrawlerTaskResult: @@ -208,6 +234,9 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): error_message = "" memory_usage = peak_memory = 0.0 + # Select appropriate config for this URL + selected_config = self.select_config(url, config) + # Get starting memory for accurate measurement process = psutil.Process() start_memory = process.memory_info().rss / (1024 * 1024) @@ -257,8 +286,8 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): retry_count=retry_count + 1 ) - # Execute the crawl - result = await self.crawler.arun(url, config=config, session_id=task_id) + # Execute the crawl with selected config + result = await self.crawler.arun(url, config=selected_config, session_id=task_id) # Measure memory usage end_memory = process.memory_info().rss / (1024 * 1024) @@ -316,7 +345,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): self, urls: List[str], crawler: AsyncWebCrawler, - config: CrawlerRunConfig, + config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], ) -> List[CrawlerTaskResult]: self.crawler = crawler @@ -470,7 +499,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): self, urls: List[str], crawler: AsyncWebCrawler, - config: CrawlerRunConfig, + config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], ) -> AsyncGenerator[CrawlerTaskResult, None]: self.crawler = crawler @@ -572,7 +601,7 @@ class SemaphoreDispatcher(BaseDispatcher): async def crawl_url( self, url: str, - config: CrawlerRunConfig, + config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], task_id: str, semaphore: asyncio.Semaphore = None, ) -> CrawlerTaskResult: @@ -580,6 +609,9 @@ class SemaphoreDispatcher(BaseDispatcher): error_message = "" memory_usage = peak_memory = 0.0 + # Select appropriate config for this URL + selected_config = self.select_config(url, config) + try: if self.monitor: self.monitor.update_task( @@ -592,7 +624,7 @@ 
class SemaphoreDispatcher(BaseDispatcher): async with semaphore: process = psutil.Process() start_memory = process.memory_info().rss / (1024 * 1024) - result = await self.crawler.arun(url, config=config, session_id=task_id) + result = await self.crawler.arun(url, config=selected_config, session_id=task_id) end_memory = process.memory_info().rss / (1024 * 1024) memory_usage = peak_memory = end_memory - start_memory @@ -654,7 +686,7 @@ class SemaphoreDispatcher(BaseDispatcher): self, crawler: AsyncWebCrawler, # noqa: F821 urls: List[str], - config: CrawlerRunConfig, + config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], ) -> List[CrawlerTaskResult]: self.crawler = crawler if self.monitor: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ed34ea50..ebd2859d 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -653,7 +653,7 @@ class AsyncWebCrawler: async def arun_many( self, urls: List[str], - config: Optional[CrawlerRunConfig] = None, + config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility # word_count_threshold=MIN_WORD_THRESHOLD, @@ -674,7 +674,9 @@ class AsyncWebCrawler: Args: urls: List of URLs to crawl - config: Configuration object controlling crawl behavior for all URLs + config: Configuration object(s) controlling crawl behavior. Can be: + - Single CrawlerRunConfig: Used for all URLs + - List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings dispatcher: The dispatcher strategy instance to use. 
Defaults to MemoryAdaptiveDispatcher [other parameters maintained for backwards compatibility] @@ -739,7 +741,11 @@ class AsyncWebCrawler: or task_result.result ) - stream = config.stream + # Handle stream setting - use first config's stream setting if config is a list + if isinstance(config, list): + stream = config[0].stream if config else False + else: + stream = config.stream if stream: diff --git a/docs/examples/demo_multi_config_clean.py b/docs/examples/demo_multi_config_clean.py new file mode 100644 index 00000000..fb3a72ed --- /dev/null +++ b/docs/examples/demo_multi_config_clean.py @@ -0,0 +1,304 @@ +""" +🎯 Multi-Config URL Matching Demo +================================= +Learn how to use different crawler configurations for different URL patterns +in a single crawl batch with Crawl4AI's multi-config feature. + +Part 1: Understanding URL Matching (Pattern Testing) +Part 2: Practical Example with Real Crawling +""" + +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + MatchMode +) +from crawl4ai.processors.pdf import PDFContentScrapingStrategy +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + + +def print_section(title): + """Print a formatted section header""" + print(f"\n{'=' * 60}") + print(f"{title}") + print(f"{'=' * 60}\n") + + +def test_url_matching(config, test_urls, config_name): + """Test URL matching for a config and show results""" + print(f"Config: {config_name}") + print(f"Matcher: {config.url_matcher}") + if hasattr(config, 'match_mode'): + print(f"Mode: {config.match_mode.value}") + print("-" * 40) + + for url in test_urls: + matches = config.is_match(url) + symbol = "βœ“" if matches else "βœ—" + print(f"{symbol} {url}") + print() + + +# ============================================================================== +# PART 1: Understanding URL Matching 
+# ============================================================================== + +def demo_part1_pattern_matching(): + """Part 1: Learn how URL matching works without crawling""" + + print_section("PART 1: Understanding URL Matching") + print("Let's explore different ways to match URLs with configs.\n") + + # Test URLs we'll use throughout + test_urls = [ + "https://example.com/report.pdf", + "https://example.com/data.json", + "https://example.com/blog/post-1", + "https://example.com/article/news", + "https://api.example.com/v1/users", + "https://example.com/about" + ] + + # 1.1 Simple String Pattern + print("1.1 Simple String Pattern Matching") + print("-" * 40) + + pdf_config = CrawlerRunConfig( + url_matcher="*.pdf" + ) + + test_url_matching(pdf_config, test_urls, "PDF Config") + + + # 1.2 Multiple String Patterns + print("1.2 Multiple String Patterns (OR logic)") + print("-" * 40) + + blog_config = CrawlerRunConfig( + url_matcher=["*/blog/*", "*/article/*", "*/news/*"], + match_mode=MatchMode.OR # This is default, shown for clarity + ) + + test_url_matching(blog_config, test_urls, "Blog/Article Config") + + + # 1.3 Single Function Matcher + print("1.3 Function-based Matching") + print("-" * 40) + + api_config = CrawlerRunConfig( + url_matcher=lambda url: 'api' in url or url.endswith('.json') + ) + + test_url_matching(api_config, test_urls, "API Config") + + + # 1.4 List of Functions + print("1.4 Multiple Functions with AND Logic") + print("-" * 40) + + # Must be HTTPS AND contain 'api' AND have version number + secure_api_config = CrawlerRunConfig( + url_matcher=[ + lambda url: url.startswith('https://'), + lambda url: 'api' in url, + lambda url: '/v' in url # Version indicator + ], + match_mode=MatchMode.AND + ) + + test_url_matching(secure_api_config, test_urls, "Secure API Config") + + + # 1.5 Mixed: String and Function Together + print("1.5 Mixed Patterns: String + Function") + print("-" * 40) + + # Match JSON files OR any API endpoint + 
json_or_api_config = CrawlerRunConfig( + url_matcher=[ + "*.json", # String pattern + lambda url: 'api' in url # Function + ], + match_mode=MatchMode.OR + ) + + test_url_matching(json_or_api_config, test_urls, "JSON or API Config") + + + # 1.6 Complex: Multiple Strings + Multiple Functions + print("1.6 Complex Matcher: Mixed Types with AND Logic") + print("-" * 40) + + # Must be: HTTPS AND (.com domain) AND (blog OR article) AND NOT a PDF + complex_config = CrawlerRunConfig( + url_matcher=[ + lambda url: url.startswith('https://'), # Function: HTTPS check + "*.com/*", # String: .com domain + lambda url: any(pattern in url for pattern in ['/blog/', '/article/']), # Function: Blog OR article + lambda url: not url.endswith('.pdf') # Function: Not PDF + ], + match_mode=MatchMode.AND + ) + + test_url_matching(complex_config, test_urls, "Complex Mixed Config") + + print("\nβœ… Key Takeaway: First matching config wins when passed to arun_many()!") + + +# ============================================================================== +# PART 2: Practical Multi-URL Crawling +# ============================================================================== + +async def demo_part2_practical_crawling(): + """Part 2: Real-world example with different content types""" + + print_section("PART 2: Practical Multi-URL Crawling") + print("Now let's see multi-config in action with real URLs.\n") + + # Create specialized configs for different content types + configs = [ + # Config 1: PDF documents - only match files ending with .pdf + CrawlerRunConfig( + url_matcher="*.pdf", + scraping_strategy=PDFContentScrapingStrategy() + ), + + # Config 2: Blog/article pages with content filtering + CrawlerRunConfig( + url_matcher=["*/blog/*", "*/article/*", "*python.org*"], + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48) + ) + ), + + # Config 3: Dynamic pages requiring JavaScript + CrawlerRunConfig( + url_matcher=lambda url: 'github.com' in url, + 
js_code="window.scrollTo(0, 500);" # Scroll to load content + ), + + # Config 4: Mixed matcher - API endpoints (string OR function) + CrawlerRunConfig( + url_matcher=[ + "*.json", # String pattern for JSON files + lambda url: 'api' in url or 'httpbin.org' in url # Function for API endpoints + ], + match_mode=MatchMode.OR, + extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) + ), + + # Config 5: Complex matcher - Secure documentation sites + CrawlerRunConfig( + url_matcher=[ + lambda url: url.startswith('https://'), # Must be HTTPS + "*.org/*", # String: .org domain + lambda url: any(doc in url for doc in ['docs', 'documentation', 'reference']), # Has docs + lambda url: not url.endswith(('.pdf', '.json')) # Not PDF or JSON + ], + match_mode=MatchMode.AND, + wait_for="css:.content, css:article" # Wait for content to load + ), + + # Default config for everything else + CrawlerRunConfig() # No url_matcher means it never matches (except as fallback) + ] + + # URLs to crawl - each will use a different config + urls = [ + "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # β†’ PDF config + "https://blog.python.org/", # β†’ Blog config with content filter + "https://github.com/microsoft/playwright", # β†’ JS config + "https://httpbin.org/json", # β†’ Mixed matcher config (API) + "https://docs.python.org/3/reference/", # β†’ Complex matcher config + "https://example.com/", # β†’ Default config + ] + + print("URLs to crawl:") + for i, url in enumerate(urls, 1): + print(f"{i}. 
{url}") + + print("\nCrawling with appropriate config for each URL...\n") + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + config=configs + ) + + # Display results + print("Results:") + print("-" * 60) + + for result in results: + if result.success: + # Determine which config was used + config_type = "Default" + if result.url.endswith('.pdf'): + config_type = "PDF Strategy" + elif any(pattern in result.url for pattern in ['blog', 'python.org']) and 'docs' not in result.url: + config_type = "Blog + Content Filter" + elif 'github.com' in result.url: + config_type = "JavaScript Enabled" + elif 'httpbin.org' in result.url or result.url.endswith('.json'): + config_type = "Mixed Matcher (API)" + elif 'docs.python.org' in result.url: + config_type = "Complex Matcher (Secure Docs)" + + print(f"\nβœ“ {result.url}") + print(f" Config used: {config_type}") + print(f" Content size: {len(result.markdown)} chars") + + # Show if we have fit_markdown (from content filter) + if hasattr(result.markdown, 'fit_markdown') and result.markdown.fit_markdown: + print(f" Fit markdown size: {len(result.markdown.fit_markdown)} chars") + reduction = (1 - len(result.markdown.fit_markdown) / len(result.markdown)) * 100 + print(f" Content reduced by: {reduction:.1f}%") + + # Show extracted data if using extraction strategy + if hasattr(result, 'extracted_content') and result.extracted_content: + print(f" Extracted data: {str(result.extracted_content)[:100]}...") + else: + print(f"\nβœ— {result.url}") + print(f" Error: {result.error_message}") + + print("\n" + "=" * 60) + print("βœ… Multi-config crawling complete!") + print("\nBenefits demonstrated:") + print("- PDFs handled with specialized scraper") + print("- Blog content filtered for relevance") + print("- JavaScript executed only where needed") + print("- Mixed matchers (string + function) for flexible matching") + print("- Complex matchers for precise URL targeting") + print("- Each URL got 
optimal configuration automatically!") + + +async def main(): + """Run both parts of the demo""" + + print(""" +🎯 Multi-Config URL Matching Demo +================================= +Learn how Crawl4AI can use different configurations +for different URLs in a single batch. + """) + + # Part 1: Pattern matching + demo_part1_pattern_matching() + + print("\nPress Enter to continue to Part 2...") + try: + input() + except EOFError: + # Running in non-interactive mode, skip input + pass + + # Part 2: Practical crawling + await demo_part2_practical_crawling() + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index 40493c21..aa4241fe 100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -404,7 +404,174 @@ for result in results: print(f"Duration: {dr.end_time - dr.start_time}") ``` -## 6. Summary +## 6. URL-Specific Configurations + +When crawling diverse content types, you often need different configurations for different URLs. 
For example: +- PDFs need specialized extraction +- Blog pages benefit from content filtering +- Dynamic sites need JavaScript execution +- API endpoints need JSON parsing + +### 6.1 Basic URL Pattern Matching + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode +from crawl4ai.processors.pdf import PDFContentScrapingStrategy +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def crawl_mixed_content(): + # Configure different strategies for different content + configs = [ + # PDF files - specialized extraction + CrawlerRunConfig( + url_matcher="*.pdf", + scraping_strategy=PDFContentScrapingStrategy() + ), + + # Blog/article pages - content filtering + CrawlerRunConfig( + url_matcher=["*/blog/*", "*/article/*"], + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48) + ) + ), + + # Dynamic pages - JavaScript execution + CrawlerRunConfig( + url_matcher=lambda url: 'github.com' in url, + js_code="window.scrollTo(0, 500);" + ), + + # API endpoints - JSON extraction + CrawlerRunConfig( + url_matcher=lambda url: 'api' in url or url.endswith('.json'), + extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) + ), + + # Default config for everything else + CrawlerRunConfig() # No url_matcher = fallback + ] + + # Mixed URLs + urls = [ + "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", + "https://blog.python.org/", + "https://github.com/microsoft/playwright", + "https://httpbin.org/json", + "https://example.com/" + ] + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + config=configs # Pass list of configs + ) + + for result in results: + print(f"{result.url}: {len(result.markdown)} chars") +``` + +### 6.2 Advanced Pattern Matching + +The `url_matcher` parameter 
supports three types of patterns: + +#### Glob Patterns (Strings) +```python +# Simple patterns +"*.pdf" # Any PDF file +"*/api/*" # Any URL with /api/ in path +"https://*.example.com/*" # Subdomain matching +"*://example.com/blog/*" # Any protocol +``` + +#### Custom Functions +```python +# Complex logic with lambdas +lambda url: url.startswith('https://') and 'secure' in url +lambda url: len(url) > 50 and url.count('/') > 5 +lambda url: any(domain in url for domain in ['api.', 'data.', 'feed.']) +``` + +#### Mixed Lists with AND/OR Logic +```python +# Combine multiple conditions +CrawlerRunConfig( + url_matcher=[ + "https://*", # Must be HTTPS + lambda url: 'internal' in url, # Must contain 'internal' + lambda url: not url.endswith('.pdf') # Must not be PDF + ], + match_mode=MatchMode.AND # ALL conditions must match +) +``` + +### 6.3 Practical Example: News Site Crawler + +```python +async def crawl_news_site(): + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=70.0, + rate_limiter=RateLimiter(base_delay=(1.0, 2.0)) + ) + + configs = [ + # Homepage - light extraction + CrawlerRunConfig( + url_matcher=lambda url: url.rstrip('/') == 'https://news.ycombinator.com', + css_selector="nav, .headline", + extraction_strategy=None + ), + + # Article pages - full extraction + CrawlerRunConfig( + url_matcher="*/article/*", + extraction_strategy=CosineStrategy( + semantic_filter="article content", + word_count_threshold=100 + ), + screenshot=True, + excluded_tags=["nav", "aside", "footer"] + ), + + # Author pages - metadata focus + CrawlerRunConfig( + url_matcher="*/author/*", + extraction_strategy=JsonCssExtractionStrategy({ + "name": "h1.author-name", + "bio": ".author-bio", + "articles": "article.post-card h2" + }) + ), + + # Everything else + CrawlerRunConfig() + ] + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=news_urls, + config=configs, + dispatcher=dispatcher + ) +``` + +### 6.4 Best Practices + +1. 
**Order Matters**: Configs are evaluated in order - put specific patterns before general ones +2. **Always Include a Default**: Keep one config with no `url_matcher` as a fallback — but note that a config without `url_matcher` never matches, and `select_config()` currently falls back to the *first* config in the list when nothing matches, so verify your fallback ordering against the dispatcher +3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns: + ```python + config = CrawlerRunConfig(url_matcher="*/api/*") + print(config.is_match("https://example.com/api/users")) # True + ``` +4. **Optimize for Performance**: + - Disable JS for static content + - Skip screenshots for data APIs + - Use appropriate extraction strategies + +## 7. Summary 1. **Two Dispatcher Types**: diff --git a/docs/md_v2/api/arun_many.md b/docs/md_v2/api/arun_many.md index 98d91c08..a233a0f9 100644 --- a/docs/md_v2/api/arun_many.md +++ b/docs/md_v2/api/arun_many.md @@ -7,7 +7,7 @@ ```python async def arun_many( urls: Union[List[str], List[Any]], - config: Optional[CrawlerRunConfig] = None, + config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None, dispatcher: Optional[BaseDispatcher] = None, ... ) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]: @@ -15,7 +15,9 @@ async def arun_many( Crawl multiple URLs concurrently or in batches. :param urls: A list of URLs (or tasks) to crawl. - :param config: (Optional) A default `CrawlerRunConfig` applying to each crawl. + :param config: (Optional) Either: + - A single `CrawlerRunConfig` applying to all URLs + - A list of `CrawlerRunConfig` objects with url_matcher patterns :param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher). ... :return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
@@ -95,6 +97,65 @@ results = await crawler.arun_many( ) ``` +### URL-Specific Configurations + +Instead of using one config for all URLs, provide a list of configs with `url_matcher` patterns: + +```python +from crawl4ai import CrawlerRunConfig, MatchMode +from crawl4ai.processors.pdf import PDFContentScrapingStrategy +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +# PDF files - specialized extraction +pdf_config = CrawlerRunConfig( + url_matcher="*.pdf", + scraping_strategy=PDFContentScrapingStrategy() +) + +# Blog/article pages - content filtering +blog_config = CrawlerRunConfig( + url_matcher=["*/blog/*", "*/article/*", "*python.org*"], + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48) + ) +) + +# Dynamic pages - JavaScript execution +github_config = CrawlerRunConfig( + url_matcher=lambda url: 'github.com' in url, + js_code="window.scrollTo(0, 500);" +) + +# API endpoints - JSON extraction +api_config = CrawlerRunConfig( + url_matcher=lambda url: 'api' in url or url.endswith('.json'), + extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) +) + +# Default fallback config +default_config = CrawlerRunConfig() # No url_matcher means it never matches except as fallback + +# Pass the list of configs - first match wins! 
+results = await crawler.arun_many( + urls=[ + "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → pdf_config + "https://blog.python.org/", # → blog_config + "https://github.com/microsoft/playwright", # → github_config + "https://httpbin.org/json", # → api_config + "https://example.com/" # → default_config + ], + config=[pdf_config, blog_config, github_config, api_config, default_config] +) +``` + +**URL Matching Features**: +- **String patterns**: `"*.pdf"`, `"*/blog/*"`, `"*python.org*"` +- **Function matchers**: `lambda url: 'api' in url` +- **Mixed patterns**: Combine strings and functions with `MatchMode.OR` or `MatchMode.AND` +- **First match wins**: Configs are evaluated in order + **Key Points**: - Each URL is processed by the same or separate sessions, depending on the dispatcher's strategy. - `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info. diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 09094460..b6b4e402 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -208,6 +208,64 @@ config = CrawlerRunConfig( ) See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detailed examples. +--- + +### I) **URL Matching Configuration** + +| **Parameter** | **Type / Default** | **What It Does** | +|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against.
Can be: string (glob), function, or list of mixed types | +| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) | + +The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`: + +```python +from crawl4ai import CrawlerRunConfig, MatchMode +from crawl4ai.processors.pdf import PDFContentScrapingStrategy +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +# Simple string pattern (glob-style) +pdf_config = CrawlerRunConfig( + url_matcher="*.pdf", + scraping_strategy=PDFContentScrapingStrategy() +) + +# Multiple patterns with OR logic (default) +blog_config = CrawlerRunConfig( + url_matcher=["*/blog/*", "*/article/*", "*/news/*"], + match_mode=MatchMode.OR # Any pattern matches +) + +# Function matcher +api_config = CrawlerRunConfig( + url_matcher=lambda url: 'api' in url or url.endswith('.json'), + extraction_strategy=JsonCssExtractionStrategy({"data": "body"}) +) + +# Mixed: String + Function with AND logic +complex_config = CrawlerRunConfig( + url_matcher=[ + lambda url: url.startswith('https://'), # Must be HTTPS + "*.org/*", # Must be .org domain + lambda url: 'docs' in url # Must contain 'docs' + ], + match_mode=MatchMode.AND # ALL conditions must match +) + +# Combined patterns and functions with AND logic +secure_docs = CrawlerRunConfig( + url_matcher=["https://*", lambda url: '.doc' in url], + match_mode=MatchMode.AND # Must be HTTPS AND contain .doc +) +``` + +**UrlMatcher Types:** +- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"` +- **Functions**: `lambda url: bool` - Custom logic for complex matching +- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND` + +When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins! 
+ ---## 2.2 Helper Methods Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies: diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 6bd5797a..34a69d7b 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -209,7 +209,13 @@ class CrawlerRunConfig: - The maximum number of concurrent crawl sessions. - Helps prevent overwhelming the system. -14. **`display_mode`**: +14. **`url_matcher`** & **`match_mode`**: + - Enable URL-specific configurations when used with `arun_many()`. + - Set `url_matcher` to patterns (glob, function, or list) to match specific URLs. + - Use `match_mode` (OR/AND) to control how multiple patterns combine. + - See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples. + +15. **`display_mode`**: - The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - Affects how much information is printed during the crawl. 
"""
Tests for the multi-config crawling feature: ``CrawlerRunConfig.url_matcher``,
``MatchMode`` and per-URL config selection in dispatchers / ``arun_many()``.

NOTE(review): the original hunk added four flattened test modules
(tests/test_arun_many.py, tests/test_config_matching_only.py,
tests/test_config_selection.py, tests/test_multi_config.py); they are
reconstructed below as one properly formatted module.  All test function
names, test data and printed output are preserved, except where a concrete
defect is fixed (each fix is marked with a NOTE(review) comment).
"""
import asyncio
import sys
from pathlib import Path

# Make the repository root importable when this file is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, MatchMode
from crawl4ai.async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher
from crawl4ai.processors.pdf import PDFContentScrapingStrategy  # referenced in commented-out examples


# ===== originally tests/test_arun_many.py =====

async def test_run_many():
    """Smoke-test ``arun_many()`` with a single (traditional) config."""
    default_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        # scraping_strategy=PDFContentScrapingStrategy()
    )

    test_urls = [
        # "https://blog.python.org/",  # Blog URL
        "https://www.python.org/",     # Generic HTTPS page
        "https://www.kidocode.com/",   # Generic HTTPS page
        "https://www.example.com/",    # Generic HTTPS page
        # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
    ]

    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works
        print("Test 1: Single config (backwards compatible)")
        result = await crawler.arun_many(urls=test_urls[:2], config=default_config)
        print(f"Crawled {len(result)} URLs with single config\n")
        for item in result:
            print(f"  {item.url} -> {item.status_code}")


# ===== originally tests/test_config_matching_only.py =====

def _check_matches(config, cases, failures):
    """Run ``config.is_match`` over (url, expected) pairs.

    Prints a per-case verdict and appends any mismatch to *failures* so the
    caller can fail the test at the end.
    """
    for url, expected in cases:
        result = config.is_match(url)
        status = "✓" if result == expected else "✗"
        if result != expected:
            failures.append((url, expected, result))
        print(f"  {status} {url} -> {result}")


def test_all_matching_scenarios():
    """Exercise every ``url_matcher`` shape without running a crawler."""
    print("Testing CrawlerRunConfig.is_match() method")
    print("=" * 50)
    failures = []  # (url, expected, got) mismatches, asserted empty at the end

    # Test 1: Single string pattern
    print("\n1. Single string pattern (glob style)")
    config = CrawlerRunConfig(
        url_matcher="*.pdf",
        # For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
    )
    _check_matches(config, [
        ("https://example.com/file.pdf", True),
        # NOTE(review): is_match() uses fnmatch.fnmatch, which is
        # case-insensitive on Windows (os.path.normcase) — this expectation
        # only holds on POSIX.  Consider fnmatchcase in the implementation.
        ("https://example.com/doc.PDF", False),
        ("https://example.com/file.txt", False),
        ("file.pdf", True),
    ], failures)

    # Test 2: List of patterns with OR
    print("\n2. List of patterns with OR (default)")
    config = CrawlerRunConfig(
        url_matcher=["*/article/*", "*/blog/*", "*.html"],
        match_mode=MatchMode.OR,
    )
    _check_matches(config, [
        ("https://example.com/article/news", True),
        ("https://example.com/blog/post", True),
        ("https://example.com/page.html", True),
        ("https://example.com/page.php", False),
    ], failures)

    # Test 3: Custom function
    print("\n3. Custom function matcher")
    config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
    )
    _check_matches(config, [
        ("https://api.example.com/data.json", True),
        ("https://api.example.com/data.xml", True),
        ("https://api.example.com/data.html", False),
        ("https://example.com/data.json", False),  # No 'api'
    ], failures)

    # Test 4: Mixed list with AND
    print("\n4. Mixed patterns and functions with AND")
    config = CrawlerRunConfig(
        url_matcher=[
            "https://*",                   # Must be HTTPS
            lambda url: '.com' in url,     # Must have .com
            lambda url: len(url) < 50,     # Must be short
        ],
        match_mode=MatchMode.AND,
    )
    _check_matches(config, [
        ("https://example.com/page", True),
        ("http://example.com/page", False),            # Not HTTPS
        ("https://example.org/page", False),           # No .com
        ("https://example.com/" + "x" * 50, False),    # Too long
    ], failures)

    # Test 5: Complex real-world scenario
    print("\n5. Complex pattern combinations")
    config = CrawlerRunConfig(
        url_matcher=[
            "*/api/v[0-9]/*",               # API versioned endpoints
            lambda url: 'graphql' in url,   # GraphQL endpoints
            "*.json",                       # JSON files
        ],
        match_mode=MatchMode.OR,
    )
    _check_matches(config, [
        ("https://example.com/api/v1/users", True),
        ("https://example.com/api/v2/posts", True),
        ("https://example.com/graphql", True),
        ("https://example.com/data.json", True),
        ("https://example.com/api/users", False),  # No version
    ], failures)

    # Test 6: Edge cases (each printed in its own format)
    print("\n6. Edge cases")

    config = CrawlerRunConfig()  # No matcher at all -> never matches
    result = config.is_match("https://example.com")
    if result:
        failures.append(("no matcher", False, result))
    print(f"  {'✓' if not result else '✗'} No matcher -> {result}")

    config = CrawlerRunConfig(url_matcher=[])  # Empty list -> never matches
    result = config.is_match("https://example.com")
    if result:
        failures.append(("empty list", False, result))
    print(f"  {'✓' if not result else '✗'} Empty list -> {result}")

    config = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"])  # None is skipped
    result = config.is_match("test.pdf")
    if not result:
        failures.append(("list with None", True, result))
    print(f"  {'✓' if result else '✗'} List with None -> {result}")

    print("\n" + "=" * 50)
    print("All matching tests completed!")
    # NOTE(review): the original only printed ✓/✗ and could never fail;
    # surface mismatches so pytest/CI actually reports them.
    assert not failures, f"is_match mismatches: {failures}"


# ===== originally tests/test_config_selection.py =====

class TestDispatcher(BaseDispatcher):
    """Simple test dispatcher to verify config selection."""

    # NOTE(review): the Test* name makes pytest try to collect this class as
    # a test case; explicitly opt out of collection.
    __test__ = False

    async def crawl_url(self, url, config, task_id, **kwargs):
        # Just return which config was selected (identity, not a crawl result).
        selected = self.select_config(url, config)
        return {"url": url, "config_id": id(selected)}

    async def run_urls(self, urls, crawler, config):
        results = []
        for url in urls:
            results.append(await self.crawl_url(url, config, "test"))
        return results


async def test_dispatcher_config_selection():
    """Verify that dispatchers pick the right config for each URL."""
    print("Testing dispatcher config selection")
    print("=" * 50)

    # Create test configs with different matchers
    pdf_config = CrawlerRunConfig(url_matcher="*.pdf")
    api_config = CrawlerRunConfig(url_matcher=lambda url: 'api' in url)
    default_config = CrawlerRunConfig()  # No matcher

    configs = [pdf_config, api_config, default_config]
    dispatcher = TestDispatcher()

    # Test single config
    print("\nTest 1: Single config")
    result = await dispatcher.crawl_url("https://example.com/file.pdf", pdf_config, "test1")
    assert result["config_id"] == id(pdf_config)
    print("✓ Single config works")

    # Test config list selection
    print("\nTest 2: Config list selection")
    test_cases = [
        ("https://example.com/file.pdf", id(pdf_config)),
        ("https://api.example.com/data", id(api_config)),
        ("https://example.com/page", id(configs[0])),  # No match, uses first
    ]
    for url, expected_id in test_cases:
        result = await dispatcher.crawl_url(url, configs, "test")
        assert result["config_id"] == expected_id, f"URL {url} got wrong config"
        print(f"✓ {url} -> correct config selected")

    # Test with MemoryAdaptiveDispatcher
    print("\nTest 3: MemoryAdaptiveDispatcher config selection")
    mem_dispatcher = MemoryAdaptiveDispatcher()
    selected = mem_dispatcher.select_config("https://example.com/doc.pdf", configs)
    assert selected == pdf_config
    print("✓ MemoryAdaptiveDispatcher.select_config works")

    # Edge cases: empty list and None both fall back to a default config
    print("\nTest 4: Edge cases")
    selected = mem_dispatcher.select_config("https://example.com", [])
    assert isinstance(selected, CrawlerRunConfig)
    print("✓ Empty config list returns default config")

    selected = mem_dispatcher.select_config("https://example.com", None)
    assert isinstance(selected, CrawlerRunConfig)
    print("✓ None config returns default config")

    print("\n" + "=" * 50)
    print("All dispatcher tests passed! ✓")


# ===== originally tests/test_multi_config.py =====

async def test_multi_config():
    """End-to-end: pass a list of configs to ``arun_many()``."""
    # Config for PDF files
    pdf_config = CrawlerRunConfig(
        url_matcher="*.pdf",
    )

    # Config for articles (using multiple patterns with OR logic)
    article_config = CrawlerRunConfig(
        url_matcher=["*/news/*", "*blog*", "*/article/*"],
        match_mode=MatchMode.OR,
        screenshot=True,
    )

    # Config using custom matcher function
    api_config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or 'json' in url,
    )

    # Config combining patterns and functions with AND logic
    secure_docs_config = CrawlerRunConfig(
        url_matcher=[
            "*.doc*",                                   # Matches .doc, .docx
            lambda url: url.startswith('https://'),     # Must be HTTPS
        ],
        match_mode=MatchMode.AND,
    )

    # Default config (no url_matcher means it won't match anything unless it's the fallback)
    default_config = CrawlerRunConfig(
        # cache_mode=CacheMode.BYPASS,
    )

    # List of configs - order matters! First match wins
    configs = [
        pdf_config,
        article_config,
        api_config,
        secure_docs_config,
        default_config,  # Fallback
    ]

    # Test URLs - using real URLs that exist
    test_urls = [
        # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
        # "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
        # "https://blog.python.org/",  # Blog URL
        # "https://api.github.com/users/github",  # GitHub API (returns JSON)
        # "https://httpbin.org/json",  # API endpoint that returns JSON
        # "https://www.python.org/",  # Generic HTTPS page
        # "http://info.cern.ch/",  # HTTP (not HTTPS) page
        "https://example.com/",  # -> Default config
    ]

    # Test the matching logic
    print("Config matching test:")
    print("-" * 50)
    for url in test_urls:
        for i, config in enumerate(configs):
            if config.is_match(url):
                print(f"{url} -> Config {i} matches")
                break
        else:
            print(f"{url} -> No match, will use fallback (first config)")

    print("\n" + "=" * 50 + "\n")

    # Now test with actual crawler
    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works
        print("Test 1: Single config (backwards compatible)")
        result = await crawler.arun_many(
            urls=["https://www.python.org/"],
            config=default_config,
        )
        print(f"Crawled {len(result)} URLs with single config\n")

        # Multiple configs - new feature
        print("Test 2: Multiple configs")
        # Just test with 2 URLs to avoid timeout
        results = await crawler.arun_many(
            urls=test_urls[:2],
            config=configs,  # Pass list of configs
        )
        print(f"Crawled {len(results)} URLs with multiple configs")

        # Using custom matcher inline
        print("\nTest 3: Inline custom matcher")
        # NOTE(review): the original threshold was len(url) > 50, but the
        # "long" example URL below is only 46 characters, so it could never
        # match; 40 keeps the intent (first URL matches, the others do not).
        custom_config = CrawlerRunConfig(
            url_matcher=lambda url: len(url) > 40 and 'python' in url.lower(),
            verbose=False,
        )
        results = await crawler.arun_many(
            urls=[
                "https://docs.python.org/3/library/asyncio.html",  # Long URL with 'python'
                "https://python.org/",       # Short URL with 'python' - won't match
                "https://www.google.com/",   # No 'python' - won't match
            ],
            config=[custom_config, default_config],
        )
        print(f"Crawled {len(results)} URLs with custom matcher")


if __name__ == "__main__":
    # Pure matching logic first (no network), then the crawler-backed tests.
    test_all_matching_scenarios()
    asyncio.run(test_run_many())
    asyncio.run(test_dispatcher_config_selection())
    asyncio.run(test_multi_config())