# Examples Outline for crawl4ai - vibe Component **Target Document Type:** Examples Collection **Target Output Filename Suggestion:** `llm_examples_vibe.md` **Library Version Context:** 0.6.3 **Outline Generation Date:** 2024-05-24 --- This document provides a collection of runnable code examples for the `vibe` component of the `crawl4ai` library, focusing on its deep crawling capabilities, filtering, and scoring mechanisms. **Note on URLs:** Most examples use placeholder URLs like `https://docs.crawl4ai.com/vibe-examples/pageN.html`. These are for demonstration and will be mocked to return predefined content. Replace them with actual URLs for real-world use. **Common Imports (assumed for many examples below, but will be included in each runnable block):** ```python import asyncio import time import re from pathlib import Path import os # For local file examples from crawl4ai import ( AsyncWebCrawler, CrawlerRunConfig, CrawlResult, BrowserConfig, CacheMode, # Deep Crawling Strategies BFSDeePCrawlStrategy, DFSDeePCrawlStrategy, BestFirstCrawlingStrategy, DeepCrawlStrategy, # For custom strategy # Filters FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, URLFilter, ContentRelevanceFilter, # Conceptual SEOFilter, # Conceptual FilterStats, # Scorers URLScorer, # For custom scorer KeywordRelevanceScorer, PathDepthScorer, ContentTypeScorer, DomainAuthorityScorer, # Conceptual FreshnessScorer, # Conceptual CompositeScorer, # Other LLMExtractionStrategy, # For combination example AsyncLogger # For custom logger example ) from unittest.mock import patch, AsyncMock # For mocking network calls # --- Mock Website Data --- # This data will be used by the MockAsyncWebCrawler to simulate a website MOCK_SITE_DATA = { "https://docs.crawl4ai.com/vibe-examples/index.html": { "html_content": """
This is page 1. It has some core content about crawl strategies.
Sub Page 1.1 Sub Page 1.2 (PDF) Back to Index """, "response_headers": {"Content-Type": "text/html"} }, "https://docs.crawl4ai.com/vibe-examples/page1_sub1.html": { "html_content": "Sub page 1.1 content. More on core concepts.
", "response_headers": {"Content-Type": "text/html"} }, "https://docs.crawl4ai.com/vibe-examples/page1_sub2.pdf": { "html_content": "%PDF-1.4 ... (Mock PDF Content: Crawl examples)", # Mock PDF content "response_headers": {"Content-Type": "application/pdf"} }, "https://docs.crawl4ai.com/vibe-examples/page2.html": { "html_content": """This page discusses a key feature and advanced configuration for async tasks.
Sub Page 2.1 """, "response_headers": {"Content-Type": "text/html"} }, "https://docs.crawl4ai.com/vibe-examples/page2_sub1.html": { "html_content": "More about the feature and JavaScript interaction.
", "response_headers": {"Content-Type": "text/html"} }, "https://docs.crawl4ai.com/vibe-examples/archive/old_page.html": { "html_content": "Archived content, less relevant.
", "response_headers": {"Content-Type": "text/html"} }, "https://docs.crawl4ai.com/vibe-examples/blog/post1.html": { "html_content": "This is a blog post about core ideas and examples.
", "response_headers": {"Content-Type": "text/html"} }, "https://docs.crawl4ai.com/vibe-examples/login.html": { "html_content": "Content loaded by JavaScript.
", "response_headers": {"Content-Type": "text/html"} }, "https://external-site.com/pageA.html": { "html_content": "Content from external site about other topics.
", "response_headers": {"Content-Type": "text/html"} }, # For local file examples "file:" + str(Path(os.getcwd()) / "test_local_index.html"): { "html_content": """Local page 1 content.
", "response_headers": {"Content-Type": "text/html"} } }

# Create the dummy local files backing every "file:" entry of MOCK_SITE_DATA.
# Iterating over the keys (rather than hard-coding each lookup) avoids a
# KeyError when one of the local entries is absent from the mock data.
for _mock_url, _mock_page in MOCK_SITE_DATA.items():
    if _mock_url.startswith("file:"):
        Path(_mock_url.removeprefix("file:")).write_text(_mock_page["html_content"])


# --- Mock AsyncWebCrawler ---
class MockAsyncWebCrawler(AsyncWebCrawler):
    """Crawler stand-in that serves pages from MOCK_SITE_DATA instead of the network."""

    _MOCK_BASE = "https://docs.crawl4ai.com/vibe-examples/"

    @staticmethod
    def _resolve_href(base_url, href):
        """Return the absolute URL for *href* found on *base_url*, or None to skip the link."""
        if href.startswith("javascript:"):
            return None  # Skip JS pseudo-links for this mock
        if href.startswith(("http", "file:")):
            return href  # Already absolute
        if base_url.startswith("file:"):
            # Keep the "file:<path>" key format used by MOCK_SITE_DATA
            # (urljoin would rewrite it to "file:///<path>").
            return "/".join(base_url.split("/")[:-1]) + "/" + href.removeprefix("./")
        # Standard resolution keeps sub-directories such as "blog/post1.html",
        # so nested mock pages stay reachable.
        from urllib.parse import urljoin
        return urljoin(base_url, href)

    async def _fetch_page(self, url: str, config: CrawlerRunConfig):
        """Build a CrawlResult for *url* from MOCK_SITE_DATA.

        Unknown URLs produce an unsuccessful result with status code 404.
        """
        await asyncio.sleep(0.01)  # Simulate network delay

        # Normalize URL for lookup (e.g. relative to absolute)
        if url.startswith(("file:", "http")):
            normalized_url = url
        else:
            # Simplified relative URL resolver for the mock
            current = getattr(self, "current_url", None)
            base_parts = current.split("/")[:-1] if current else []
            normalized_url = "/".join(base_parts + [url])
            if "docs.crawl4ai.com" not in normalized_url and not normalized_url.startswith("file:"):
                normalized_url = self._MOCK_BASE + url.lstrip("/")  # ensure base domain

        page_data = MOCK_SITE_DATA.get(normalized_url)
        if page_data is None:
            return CrawlResult(
                url=url,
                html="",
                success=False,
                status_code=404,
                error_message="Mock URL not found",
            )

        self.current_url = normalized_url  # Store for relative path resolution

        # Basic link extraction for deep crawling
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(page_data["html_content"], "html.parser")
        links = []
        for a_tag in soup.find_all("a", href=True):
            abs_href = self._resolve_href(normalized_url, a_tag["href"])
            if abs_href:
                links.append({"href": abs_href, "text": a_tag.get_text(strip=True)})

        return CrawlResult(
            url=normalized_url,
            html=page_data["html_content"],  # CrawlResult's page-source field is `html`
            success=True,
            status_code=200,
            response_headers=page_data.get("response_headers", {"Content-Type": "text/html"}),
            links={
                "internal": [
                    l for l in links
                    if "docs.crawl4ai.com/vibe-examples" in l["href"] or l["href"].startswith("file:")
                ],
                "external": [l for l in links if "external-site.com" in l["href"]],
            },
        )

    async def arun(self, url: str, config: CrawlerRunConfig = None, **kwargs):
        """Run a deep crawl when the config carries a strategy, else fetch a single page."""
        # The real crawler routes deep crawls through a decorator; the mock
        # delegates to the strategy directly and passes itself as the crawler.
        if config and config.deep_crawl_strategy:
            return await config.deep_crawl_strategy.arun(
                crawler=self,
                start_url=url,
                config=config,
            )
        # Fallback to single page fetch if no deep crawl strategy
        self.current_url = url  # Set for relative path resolution in _fetch_page
        return await self._fetch_page(url, config)

    async def arun_many(self, urls: list[str], config: CrawlerRunConfig = None, **kwargs):
        """Fetch every URL in *urls*; return a list, or an async generator when config.stream is set."""
        results = []
        for url_item in urls:
            # Items are plain URL strings, or tuples such as
            # (score, depth, url, parent_url) where the URL sits at index 2.
            current_url_to_crawl = url_item
            if isinstance(url_item, tuple) and len(url_item) >= 3:
                current_url_to_crawl = url_item[2]
            self.current_url = current_url_to_crawl  # Set for relative path resolution
            results.append(await self._fetch_page(current_url_to_crawl, config))

        if config and config.stream:
            async def result_generator():
                for res in results:
                    yield res
            return result_generator()
        return results

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        pass

    async def start(self):
        self.ready = True
        return self

    async def close(self):
        self.ready = False
# --- End Mock ---
```

---

## 1. Introduction to Deep Crawling (`vibe`)

The `vibe` component of Crawl4ai provides powerful deep crawling capabilities, allowing you to traverse websites by following links and processing multiple pages.

### 1.1. Example: Enabling Basic Deep Crawl with `BFSDeepCrawlStrategy` via `CrawlerRunConfig`.

This example demonstrates how to enable a basic Breadth-First Search (BFS) deep crawl by setting the `deep_crawl_strategy` in `CrawlerRunConfig`.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Using the MockAsyncWebCrawler defined in the preamble.
# The patch target is the name bound in *this* module: `from crawl4ai import
# AsyncWebCrawler` copies the reference here, so patching
# 'crawl4ai.AsyncWebCrawler' would leave the real crawler in use.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def basic_bfs_deep_crawl():
    # Configure BFS to crawl up to 1 level deep from the start URL
    bfs_strategy = BFSDeepCrawlStrategy(max_depth=1)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        # For mock, ensure cache is bypassed to see fresh mock results
        cache_mode=CacheMode.BYPASS,
    )

    # AsyncWebCrawler resolves to MockAsyncWebCrawler while the patch is active
    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- Basic BFS Deep Crawl (max_depth=1) ---")
        print(f"Crawled {len(results)} pages starting from {start_url}:")
        for i, result in enumerate(results):
            if result.success:
                print(f"  {i+1}. URL: {result.url}, Depth: {result.metadata.get('depth')}, Parent: {result.metadata.get('parent_url')}")
            else:
                print(f"  {i+1}. FAILED: {result.url}, Error: {result.error_message}")

if __name__ == "__main__":
    asyncio.run(basic_bfs_deep_crawl())
```

### 1.2. Example: Understanding `CrawlResult.metadata` (depth, parent_url, score) in Deep Crawl Results.

Each `CrawlResult` from a deep crawl contains useful metadata like the crawl `depth`, the `parent_url` from which it was discovered, and a `score` (if applicable, e.g., with `BestFirstCrawlingStrategy`).

```python
import asyncio
from crawl4ai import (
    AsyncWebCrawler,
    CacheMode,
    CrawlerRunConfig,
    BFSDeepCrawlStrategy,
    KeywordRelevanceScorer,
    BestFirstCrawlingStrategy,
)
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def understand_metadata():
    # Using BestFirstCrawlingStrategy to demonstrate scores
    scorer = KeywordRelevanceScorer(keywords=["feature", "core"])
    strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- Understanding CrawlResult.metadata ---")
        for result in results:
            if result.success:
                depth = result.metadata.get('depth', 'N/A')
                parent = result.metadata.get('parent_url', 'N/A')
                score = result.metadata.get('score', 'N/A')  # Score comes from BestFirst strategy
                print(f"URL: {result.url}")
                print(f"  Depth: {depth}")
                print(f"  Parent URL: {parent}")
                print(f"  Score: {score if score != 'N/A' else 'N/A (not scored or BFS/DFS)'}")
                print("-" * 20)

if __name__ == "__main__":
    asyncio.run(understand_metadata())
```

### 1.3. Example: Minimal setup for deep crawling a single level deep.

This demonstrates the most straightforward way to perform a shallow deep crawl (depth 1).

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def minimal_single_level_deep_crawl():
    # BFS strategy, max_depth=1 means start_url + its direct links
    strategy = BFSDeepCrawlStrategy(max_depth=1)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- Minimal Single Level Deep Crawl (max_depth=1) ---")
        print(f"Total pages crawled: {len(results)}")
        for result in results:
            if result.success:
                print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

if __name__ == "__main__":
    asyncio.run(minimal_single_level_deep_crawl())
```

---

## 2. Breadth-First Search (`BFSDeepCrawlStrategy`) Examples

`BFSDeepCrawlStrategy` explores the website level by level.

### 2.1. Example: Basic `BFSDeepCrawlStrategy` with default depth.

The default `max_depth` for `BFSDeepCrawlStrategy` is often 1 if not specified, meaning it crawls the start URL and its direct links.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_default_depth():
    # max_depth=1 means start_url + its direct children. It is passed
    # explicitly because the strategy constructor takes `max_depth` as its
    # first argument; calling it with no arguments would fail if the library
    # defines no implicit default.
    strategy = BFSDeepCrawlStrategy(max_depth=1)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BFS with Default Depth (max_depth=1) ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

if __name__ == "__main__":
    asyncio.run(bfs_default_depth())
```

### 2.2. Example: `BFSDeepCrawlStrategy` - Setting `max_depth` to control crawl depth (e.g., 3 levels).

Control how many levels deep the BFS crawler will go from the start URL. `max_depth=0` means only the start URL. `max_depth=1` means start URL + its direct links.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_set_max_depth():
    # Start URL (0), its links (1), and their links (2)
    strategy = BFSDeepCrawlStrategy(max_depth=2)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BFS with max_depth=2 ---")
        print(f"Crawled {len(results)} pages.")
        for result in sorted(results, key=lambda r: (r.metadata.get('depth', 0), r.url)):
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

        # Verify that no pages with depth > 2 are present
        assert all(r.metadata.get('depth', 0) <= 2 for r in results if r.success)

if __name__ == "__main__":
    asyncio.run(bfs_set_max_depth())
```

### 2.3. Example: `BFSDeepCrawlStrategy` - Setting `max_pages` to limit the total number of pages crawled (e.g., 10 pages).

Limit the crawl to a maximum number of pages, regardless of depth.

```python
import asyncio
import math  # for math.inf
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_set_max_pages():
    strategy = BFSDeepCrawlStrategy(
        max_depth=math.inf,  # Effectively no depth limit for this test
        max_pages=3,         # Limit to 3 pages
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BFS with max_pages=3 ---")
        print(f"Crawled {len(results)} pages (should be at most 3).")
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

        assert len(results) <= 3

if __name__ == "__main__":
    asyncio.run(bfs_set_max_pages())
```

### 2.4. Example: `BFSDeepCrawlStrategy` - Using `include_external=True` to follow links to external domains.

Allow the BFS crawler to follow links that lead to different domains than the start URL.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_include_external():
    strategy = BFSDeepCrawlStrategy(
        max_depth=1,
        include_external=True,
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BFS with include_external=True (max_depth=1) ---")
        print(f"Crawled {len(results)} pages.")
        found_external = False
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")
            if "external-site.com" in result.url:
                found_external = True

        assert found_external, "Expected to crawl an external link."

if __name__ == "__main__":
    asyncio.run(bfs_include_external())
```

### 2.5. Example: `BFSDeepCrawlStrategy` - Using `include_external=False` (default) to stay within the starting domain.

The default behavior is to only crawl links within the same domain as the start URL.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_exclude_external():
    strategy = BFSDeepCrawlStrategy(
        max_depth=1,
        include_external=False,  # Default, but explicit for clarity
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BFS with include_external=False (max_depth=1) ---")
        print(f"Crawled {len(results)} pages.")
        found_external = False
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")
            if "external-site.com" in result.url:
                found_external = True

        assert not found_external, "Should not have crawled external links."

if __name__ == "__main__":
    asyncio.run(bfs_exclude_external())
```

### 2.6. Example: `BFSDeepCrawlStrategy` - Streaming results using `CrawlerRunConfig(stream=True)`.

Process results as they become available, useful for long crawls.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_streaming_results():
    strategy = BFSDeepCrawlStrategy(max_depth=1)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=True,  # Enable streaming
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("--- BFS with Streaming Results (max_depth=1) ---")
        count = 0
        # With stream=True, arun() resolves to an async iterator of results
        async for result in await crawler.arun(url=start_url, config=run_config):
            count += 1
            if result.success:
                print(f"  Streamed Result {count}: {result.url}, Depth: {result.metadata.get('depth')}")
            else:
                print(f"  Streamed FAILED Result {count}: {result.url}, Error: {result.error_message}")
        print(f"Total results streamed: {count}")

if __name__ == "__main__":
    asyncio.run(bfs_streaming_results())
```

### 2.7. Example: `BFSDeepCrawlStrategy` - Batch results using `CrawlerRunConfig(stream=False)` (default).

The default behavior is to return all results as a list after the crawl completes.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_batch_results():
    strategy = BFSDeepCrawlStrategy(max_depth=1)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=False,  # Default, but explicit for clarity
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)  # Returns a list

        print("--- BFS with Batch Results (max_depth=1) ---")
        print(f"Received {len(results)} pages in a batch.")
        for result in results:
            if result.success:
                print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

if __name__ == "__main__":
    asyncio.run(bfs_batch_results())
```

### 2.8. Example: `BFSDeepCrawlStrategy` - Integrating a `FilterChain` with `URLPatternFilter` to crawl specific paths.

Use filters to guide the crawler, for instance, to only explore URLs matching `/blog/*`.

```python
import asyncio
from crawl4ai import (
    AsyncWebCrawler,
    CacheMode,
    CrawlerRunConfig,
    BFSDeepCrawlStrategy,
    FilterChain,
    URLPatternFilter,
)
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_with_url_pattern_filter():
    # Only crawl URLs containing '/blog/'
    url_filter = URLPatternFilter(patterns=["*/blog/*"])
    filter_chain = FilterChain(filters=[url_filter])

    strategy = BFSDeepCrawlStrategy(
        max_depth=1,
        filter_chain=filter_chain,
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BFS with URLPatternFilter ('*/blog/*') ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

        # The start_url itself is always processed, then its links are filtered.
        # So, we check that all *discovered* pages (depth > 0) match the pattern.
        discovered_pages = [r for r in results if r.metadata.get('depth', 0) > 0]
        if discovered_pages:  # only assert if any pages beyond start_url were processed
            assert all("/blog/" in r.url for r in discovered_pages), "Not all crawled pages matched the /blog/ pattern"
        print("Filter applied successfully (start URL is always processed, subsequent links are filtered).")

if __name__ == "__main__":
    asyncio.run(bfs_with_url_pattern_filter())
```

### 2.9. Example: `BFSDeepCrawlStrategy` - Demonstrating `shutdown()` to gracefully stop an ongoing crawl.

Showcase how to stop a crawl prematurely using the strategy's `shutdown()` method.

```python
import asyncio
import time
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_demonstrate_shutdown():
    strategy = BFSDeepCrawlStrategy(
        max_depth=5,  # A potentially long crawl
        max_pages=100,
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=True,  # Streaming is good to see partial results before shutdown
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"  # A site with enough links
        print("--- BFS with shutdown() demonstration ---")

        # Run the crawl in the background so shutdown() can be called while it is in flight
        crawl_task = asyncio.create_task(crawler.arun(url=start_url, config=run_config))

        # Let the crawl run for a very short time
        await asyncio.sleep(0.1)

        print("Attempting to shut down the crawl...")
        await strategy.shutdown()

        results_list = []
        try:
            # Await the results from the crawl task.
            # If streaming, this will iterate through what was processed before shutdown.
            async for res in await crawl_task:
                results_list.append(res)
                print(f"  Collected result (post-shutdown signal): {res.url}")
        except asyncio.CancelledError:
            print("Crawl task was cancelled.")

        print(f"Crawl shut down. Processed {len(results_list)} pages before/during shutdown.")
        # The number of pages will be less than if it ran to completion
        assert len(results_list) < 10, "Crawl likely didn't shut down early enough or mock site too small."

if __name__ == "__main__":
    asyncio.run(bfs_demonstrate_shutdown())
```

### 2.10. Example: `BFSDeepCrawlStrategy` - Crawling with no `max_depth` limit but a `max_pages` limit.

Demonstrate a scenario where depth is unlimited (or very high) but the crawl stops after a certain number of pages.

```python
import asyncio
import math
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def bfs_no_depth_limit_max_pages():
    strategy = BFSDeepCrawlStrategy(
        max_depth=math.inf,  # Unlimited depth
        max_pages=4,         # But only 4 pages
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BFS with no depth limit, max_pages=4 ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

        assert len(results) <= 4, "More pages crawled than max_pages limit."

if __name__ == "__main__":
    asyncio.run(bfs_no_depth_limit_max_pages())
```

---

## 3. Depth-First Search (`DFSDeepCrawlStrategy`) Examples

`DFSDeepCrawlStrategy` explores as far down one branch as possible before backtracking.

### 3.1. Example: Basic `DFSDeepCrawlStrategy` with default depth.

The default `max_depth` for `DFSDeepCrawlStrategy` is typically 10 if not specified.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, DFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def dfs_default_depth():
    # A deep limit (10) is passed explicitly, since the constructor takes
    # `max_depth` as its first argument. `max_pages` is a strategy option
    # (as in the other examples), not a CrawlerRunConfig option.
    strategy = DFSDeepCrawlStrategy(
        max_depth=10,
        max_pages=5,  # Limit pages to keep example short with this depth
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DFS with Default Depth (max_pages=5 to limit output) ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:  # Order might be less predictable than BFS for small mock
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

if __name__ == "__main__":
    asyncio.run(dfs_default_depth())
```

### 3.2. Example: `DFSDeepCrawlStrategy` - Setting `max_depth` to control how deep each branch goes.

Set `max_depth` to 2 for a DFS crawl.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, DFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def dfs_set_max_depth():
    strategy = DFSDeepCrawlStrategy(max_depth=2)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DFS with max_depth=2 ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

        assert all(r.metadata.get('depth', 0) <= 2 for r in results if r.success)

if __name__ == "__main__":
    asyncio.run(dfs_set_max_depth())
```

### 3.3. Example: `DFSDeepCrawlStrategy` - Setting `max_pages` to limit the total number of pages.

Limit the total number of pages crawled by DFS to 3.

```python
import asyncio
import math
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, DFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def dfs_set_max_pages():
    strategy = DFSDeepCrawlStrategy(
        max_depth=math.inf,  # No depth limit for this test
        max_pages=3,
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DFS with max_pages=3 ---")
        print(f"Crawled {len(results)} pages (should be at most 3).")
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")

        assert len(results) <= 3

if __name__ == "__main__":
    asyncio.run(dfs_set_max_pages())
```

### 3.4. Example: `DFSDeepCrawlStrategy` - Following external links with `include_external=True`.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, DFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def dfs_include_external():
    strategy = DFSDeepCrawlStrategy(
        max_depth=1,
        include_external=True,
        max_pages=5,  # Limit pages as external can be vast
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DFS with include_external=True (max_depth=1, max_pages=5) ---")
        print(f"Crawled {len(results)} pages.")
        found_external = False
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")
            if "external-site.com" in result.url:
                found_external = True

        assert found_external, "Expected to crawl an external link."

if __name__ == "__main__":
    asyncio.run(dfs_include_external())
```

### 3.5. Example: `DFSDeepCrawlStrategy` - Staying within the domain with `include_external=False`.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, DFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def dfs_exclude_external():
    strategy = DFSDeepCrawlStrategy(
        max_depth=1,
        include_external=False,  # Default
    )

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DFS with include_external=False (max_depth=1) ---")
        print(f"Crawled {len(results)} pages.")
        found_external = False
        for result in results:
            print(f"  URL: {result.url}, Depth: {result.metadata.get('depth')}")
            if "external-site.com" in result.url:
                found_external = True

        assert not found_external, "Should not have crawled external links."

if __name__ == "__main__":
    asyncio.run(dfs_exclude_external())
```

### 3.6. Example: `DFSDeepCrawlStrategy` - Streaming results.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, DFSDeepCrawlStrategy
from unittest.mock import patch

# Patch the name bound in this module so the mock crawler is actually used.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def dfs_streaming_results():
    strategy = DFSDeepCrawlStrategy(max_depth=1)

    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=True,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("--- DFS with Streaming Results (max_depth=1) ---")
        count = 0
        # With stream=True, arun() resolves to an async iterator of results
        async for result in await crawler.arun(url=start_url, config=run_config):
            count += 1
            if result.success:
                print(f"  Streamed Result {count}: {result.url}, Depth: {result.metadata.get('depth')}")
        print(f"Total results streamed: {count}")

if __name__ == "__main__":
    asyncio.run(dfs_streaming_results())
```

### 3.7. Example: `DFSDeepCrawlStrategy` - Batch results.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, DFSDeepCrawlStrategy
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def dfs_batch_results():
    """Collect all DFS results as a single batch (`stream=False`) and print the successful ones."""
    strategy = DFSDeepCrawlStrategy(max_depth=1)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=False,  # Default
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DFS with Batch Results (max_depth=1) ---")
        print(f"Received {len(results)} pages in a batch.")
        for result in results:
            if result.success:
                print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")


if __name__ == "__main__":
    asyncio.run(dfs_batch_results())
```

### 3.8. Example: `DFSDeepCrawlStrategy` - Integrating a `FilterChain` with `DomainFilter` to restrict to subdomains.

This example is conceptual for subdomains as MOCK_SITE_DATA doesn't have distinct subdomains. The filter setup is key.

```python
import asyncio
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, DFSDeepCrawlStrategy, FilterChain, DomainFilter
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def dfs_with_domain_filter_subdomains():
    """Run a depth-1 DFS crawl restricted by a `DomainFilter` to one domain and its subdomains."""
    # Allow only the start domain and its subdomains.
    # For this mock, 'docs.crawl4ai.com' will be the main domain.
    # If we had e.g., 'blog.docs.crawl4ai.com', this filter would allow it.
    domain_filter = DomainFilter(
        allowed_domains=["docs.crawl4ai.com"],
        allow_subdomains=True,
    )
    filter_chain = FilterChain(filters=[domain_filter])

    strategy = DFSDeepCrawlStrategy(
        max_depth=1,
        filter_chain=filter_chain,
        include_external=True,  # Necessary to even consider other (sub)domains
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DFS with DomainFilter (allow subdomains of docs.crawl4ai.com) ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
            # Compare the parsed hostname rather than a substring of the URL, so a
            # third-party host cannot slip through the check.
            # Our mock data doesn't have true subdomains to exercise the subdomain branch.
            host = urlparse(result.url).hostname or ""
            assert host == "docs.crawl4ai.com" or host.endswith(".docs.crawl4ai.com"), (
                f"Page {result.url} is outside the allowed domain."
            )


if __name__ == "__main__":
    asyncio.run(dfs_with_domain_filter_subdomains())
```

---

## 4. Best-First Crawling (`BestFirstCrawlingStrategy`) Examples

`BestFirstCrawlingStrategy` uses a priority queue, guided by scorers, to decide which URLs to crawl next.

### 4.1. Example: Basic `BestFirstCrawlingStrategy` with default parameters.

If no `url_scorer` is provided, it behaves somewhat like BFS but might have different internal queue management.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_default_params():
    """Run a depth-1 best-first crawl without a `url_scorer` and print each result's depth and score."""
    strategy = BestFirstCrawlingStrategy(max_depth=1)  # Default scorer (often scores 0)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BestFirstCrawlingStrategy with default parameters (max_depth=1) ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")


if __name__ == "__main__":
    asyncio.run(best_first_default_params())
```

### 4.2. Example: `BestFirstCrawlingStrategy` - Setting `max_depth` to limit crawl depth.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_max_depth():
    """Run a best-first crawl with `max_depth=2` and check no successful result is deeper."""
    strategy = BestFirstCrawlingStrategy(max_depth=2)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BestFirstCrawlingStrategy with max_depth=2 ---")
        print(f"Crawled {len(results)} pages.")
        # Sort by (depth, url) only for stable, readable output.
        for result in sorted(results, key=lambda r: (r.metadata.get('depth', 0), r.url)):
            print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")
        assert all(r.metadata.get('depth', 0) <= 2 for r in results if r.success)


if __name__ == "__main__":
    asyncio.run(best_first_max_depth())
```

### 4.3. Example: `BestFirstCrawlingStrategy` - Setting `max_pages` to limit total pages crawled.

```python
import asyncio
import math
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_max_pages():
    """Run a best-first crawl with unbounded depth and `max_pages=3`, then check the page cap."""
    strategy = BestFirstCrawlingStrategy(
        max_depth=math.inf,  # No depth limit, so max_pages is the only bound
        max_pages=3,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BestFirstCrawlingStrategy with max_pages=3 ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")
        assert len(results) <= 3


if __name__ == "__main__":
    asyncio.run(best_first_max_pages())
```

### 4.4. Example: `BestFirstCrawlingStrategy` - Using `include_external=True`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_include_external():
    """Run a depth-1 best-first crawl with `include_external=True` and check an external URL is returned."""
    strategy = BestFirstCrawlingStrategy(
        max_depth=1,
        include_external=True,
        max_pages=5,  # To keep it manageable
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BestFirstCrawlingStrategy with include_external=True (max_depth=1) ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")

        found_external = any("external-site.com" in result.url for result in results)
        assert found_external, "Expected to crawl an external link."


if __name__ == "__main__":
    asyncio.run(best_first_include_external())
```

### 4.5. Example: `BestFirstCrawlingStrategy` - Using `KeywordRelevanceScorer` to prioritize URLs containing specific keywords.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, KeywordRelevanceScorer
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_keyword_scorer():
    """Stream a best-first crawl scored by `KeywordRelevanceScorer` and check the 'feature' page is crawled."""
    scorer = KeywordRelevanceScorer(keywords=["feature", "advanced", "core"])
    strategy = BestFirstCrawlingStrategy(
        max_depth=1,
        url_scorer=scorer,
        max_pages=4,  # Limit for example clarity
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
        stream=True,  # Stream to see order
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("--- BestFirstCrawlingStrategy with KeywordRelevanceScorer ---")
        results_list = []
        async for result in await crawler.arun(url=start_url, config=run_config):
            results_list.append(result)
            if result.success:
                print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f} (Depth: {result.metadata.get('depth')})")

        # Check if pages with keywords like "feature" or "core" were prioritized (appeared earlier/higher score).
        # This is a soft check as actual order depends on many factors in a real crawl
        # and the mock site's link structure.
        print("\nNote: Higher scores should ideally correspond to URLs with keywords 'feature', 'advanced', 'core'.")
        feature_page_crawled = any("page2.html" in r.url for r in results_list)  # page2 has "feature"
        assert feature_page_crawled, "Page with 'feature' keyword was expected."


if __name__ == "__main__":
    asyncio.run(best_first_keyword_scorer())
```

### 4.6. Example: `BestFirstCrawlingStrategy` - Using `PathDepthScorer` to influence priority based on URL path depth.

This scorer penalizes deeper paths by default.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, PathDepthScorer
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_path_depth_scorer():
    """Stream a best-first crawl scored by `PathDepthScorer` and print each result's score and depth."""
    # Penalizes deeper paths (lower score for deeper paths)
    scorer = PathDepthScorer(higher_score_is_better=False)
    strategy = BestFirstCrawlingStrategy(
        max_depth=2,  # Allow some depth to see scorer effect
        url_scorer=scorer,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
        stream=True,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("--- BestFirstCrawlingStrategy with PathDepthScorer (favoring shallower paths) ---")
        async for result in await crawler.arun(url=start_url, config=run_config):
            if result.success:
                print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")

        # NOTE(review): this example only prints scores; it does not assert an ordering.
        # Confirm the exact effect of `higher_score_is_better` against the real
        # PathDepthScorer signature before adding an assertion here.
        print("\nNote: Shallower pages should ideally have higher scores.")


if __name__ == "__main__":
    asyncio.run(best_first_path_depth_scorer())
```

### 4.7. Example: `BestFirstCrawlingStrategy` - Using `ContentTypeScorer` to prioritize HTML pages over PDFs.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, ContentTypeScorer
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_content_type_scorer():
    """Stream a best-first crawl scored by `ContentTypeScorer` and compare the HTML and PDF page scores."""
    # Prioritize HTML, penalize PDF
    scorer = ContentTypeScorer(content_type_weights={"text/html": 1.0, "application/pdf": -0.5})
    strategy = BestFirstCrawlingStrategy(
        max_depth=1,
        url_scorer=scorer,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
        stream=True,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"  # This page links to HTML and PDF
        print("--- BestFirstCrawlingStrategy with ContentTypeScorer (HTML > PDF) ---")
        results_list = []
        async for result in await crawler.arun(url=start_url, config=run_config):
            results_list.append(result)
            if result.success:
                print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Content-Type: {result.response_headers.get('Content-Type')}")

        html_page_score = next((r.metadata.get('score') for r in results_list if "page1_sub1.html" in r.url), None)
        pdf_page_score = next((r.metadata.get('score') for r in results_list if "page1_sub2.pdf" in r.url), None)

        print(f"HTML page score: {html_page_score}, PDF page score: {pdf_page_score}")
        # Only compare when both pages were actually crawled and scored.
        if html_page_score is not None and pdf_page_score is not None:
            assert html_page_score > pdf_page_score, "HTML page should have a higher score than PDF."
        else:
            print("Warning: Could not find both HTML and PDF pages in results to compare scores.")


if __name__ == "__main__":
    asyncio.run(best_first_content_type_scorer())
```

### 4.8. Example: `BestFirstCrawlingStrategy` - Using `CompositeScorer` to combine `KeywordRelevanceScorer` and `PathDepthScorer`.
```python
import asyncio
from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    CacheMode,
    BestFirstCrawlingStrategy,
    KeywordRelevanceScorer,
    PathDepthScorer,
    CompositeScorer,
)
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_composite_scorer():
    """Stream a best-first crawl scored by a `CompositeScorer` of keyword and path-depth scorers."""
    keyword_scorer = KeywordRelevanceScorer(keywords=["feature", "core"], weight=0.7)
    path_scorer = PathDepthScorer(weight=0.3, higher_score_is_better=False)  # Penalize depth slightly
    composite_scorer = CompositeScorer(scorers=[keyword_scorer, path_scorer])

    strategy = BestFirstCrawlingStrategy(
        max_depth=2,
        url_scorer=composite_scorer,
        max_pages=6,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
        stream=True,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("--- BestFirstCrawlingStrategy with CompositeScorer ---")
        async for result in await crawler.arun(url=start_url, config=run_config):
            if result.success:
                print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")
        print("\nNote: Scores are a combination of keyword relevance and path depth penalty.")


if __name__ == "__main__":
    asyncio.run(best_first_composite_scorer())
```

### 4.9. Example: `BestFirstCrawlingStrategy` - Integrating a `FilterChain` with `ContentTypeFilter` to only process HTML.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, FilterChain, ContentTypeFilter
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_with_content_type_filter():
    """Run a depth-1 best-first crawl with a `ContentTypeFilter` and check discovered pages are HTML."""
    content_filter = ContentTypeFilter(allowed_types=["text/html"])
    filter_chain = FilterChain(filters=[content_filter])

    # Scorer is optional here, just demonstrating filter integration
    strategy = BestFirstCrawlingStrategy(
        max_depth=1,
        filter_chain=filter_chain,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"  # This page links to HTML and PDF
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BestFirstCrawlingStrategy with ContentTypeFilter (HTML only) ---")
        print(f"Crawled {len(results)} pages.")
        for result in results:
            content_type = result.response_headers.get('Content-Type', '')
            print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Content-Type: {content_type}")

        # The start URL is not filtered, so only check pages discovered at depth > 0.
        discovered_pages = [r for r in results if r.metadata.get('depth', 0) > 0]
        if discovered_pages:
            assert all(
                "text/html" in r.response_headers.get('Content-Type', '') for r in discovered_pages
            ), "Non-HTML page found among discovered pages."
        print("Filter for HTML content type applied successfully to discovered pages.")


if __name__ == "__main__":
    asyncio.run(best_first_with_content_type_filter())
```

### 4.10. Example: `BestFirstCrawlingStrategy` - Streaming results and observing the order based on scores.

This example will use a scorer and stream results to demonstrate that higher-scored URLs are (generally) processed earlier.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, KeywordRelevanceScorer
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_streaming_order():
    """Stream a keyword-scored best-first crawl and print URLs with scores in processing order."""
    scorer = KeywordRelevanceScorer(keywords=["feature", "advanced"])
    strategy = BestFirstCrawlingStrategy(
        max_depth=1,
        url_scorer=scorer,
        max_pages=5,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=True,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("--- BestFirstCrawlingStrategy - Streaming and Observing Order ---")
        processed_urls = []
        async for result in await crawler.arun(url=start_url, config=run_config):
            if result.success:
                current_score = result.metadata.get('score', 0.0)
                print(f" Streamed: {result.url}, Score: {current_score:.2f}, Depth: {result.metadata.get('depth')}")
                # Note: Due to batching (BATCH_SIZE) and async nature, strict descending order isn't guaranteed,
                # but generally higher scored items should appear earlier. No ordering is asserted here.
                processed_urls.append((result.url, current_score))

        print("\nProcessed URLs and their scores (order of processing):")
        for url, score in processed_urls:
            print(f" {url} (Score: {score:.2f})")
        print("Note: Higher scored URLs are prioritized but strict order depends on batching and concurrency.")


if __name__ == "__main__":
    asyncio.run(best_first_streaming_order())
```

### 4.11. Example: `BestFirstCrawlingStrategy` - Batch results and analyzing scores post-crawl.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, KeywordRelevanceScorer
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_batch_analysis():
    """Run a keyword-scored best-first crawl in batch mode and print results sorted by score."""
    scorer = KeywordRelevanceScorer(keywords=["feature", "core"])
    strategy = BestFirstCrawlingStrategy(
        max_depth=1,
        url_scorer=scorer,
        max_pages=5,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=False,  # Batch mode
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BestFirstCrawlingStrategy - Batch Results Analysis ---")
        print(f"Received {len(results)} pages.")

        # Sort by score for analysis (higher score first)
        sorted_results = sorted(results, key=lambda r: r.metadata.get('score', 0.0), reverse=True)
        for result in sorted_results:
            if result.success:
                print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")


if __name__ == "__main__":
    asyncio.run(best_first_batch_analysis())
```

### 4.12. Example: `BestFirstCrawlingStrategy` - Accessing and interpreting `score`, `depth`, and `parent_url` from `CrawlResult.metadata`.

This explicitly shows how to get these specific metadata fields.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, KeywordRelevanceScorer
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_access_metadata():
    """Print the `depth`, `parent_url`, and `score` metadata of each successful crawl result."""
    scorer = KeywordRelevanceScorer(keywords=["feature"])
    strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- BestFirstCrawlingStrategy - Accessing Metadata ---")
        for result in results:
            if not result.success:
                continue
            metadata = result.metadata
            depth = metadata.get('depth', 'N/A')
            parent_url = metadata.get('parent_url', 'N/A')
            score = metadata.get('score', 'N/A')

            print(f"URL: {result.url}")
            print(f" Depth: {depth}")
            print(f" Parent URL: {parent_url}")
            # Format any numeric score (int or float); fall back to the raw value ('N/A') otherwise.
            if isinstance(score, (int, float)):
                print(f" Score: {score:.2f}")
            else:
                print(f" Score: {score}")
            print("-" * 10)


if __name__ == "__main__":
    asyncio.run(best_first_access_metadata())
```

### 4.13. Example: `BestFirstCrawlingStrategy` - Demonstrating `shutdown()` to stop an ongoing prioritized crawl.
```python
import asyncio
import time
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, KeywordRelevanceScorer
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_demonstrate_shutdown():
    """Start a streaming best-first crawl, call `strategy.shutdown()`, then collect what was produced."""
    scorer = KeywordRelevanceScorer(keywords=["feature", "core", "example"])
    strategy = BestFirstCrawlingStrategy(
        max_depth=5,  # A potentially long crawl
        max_pages=100,
        url_scorer=scorer,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=True,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("--- BestFirstCrawlingStrategy with shutdown() demonstration ---")

        # Run the crawl as a background task so shutdown can be requested while it is in flight.
        crawl_task = asyncio.create_task(crawler.arun(url=start_url, config=run_config))

        await asyncio.sleep(0.1)
        print("Attempting to shut down the BestFirst crawl...")
        await strategy.shutdown()

        results_list = []
        try:
            async for res in await crawl_task:
                results_list.append(res)
                print(f" Collected result (post-shutdown signal): {res.url} (Score: {res.metadata.get('score', 0.0):.2f})")
        except asyncio.CancelledError:
            print("Crawl task was cancelled.")

        print(f"Crawl shut down. Processed {len(results_list)} pages before/during shutdown.")
        assert len(results_list) < 10, "Crawl likely didn't shut down early enough or mock site too small."


if __name__ == "__main__":
    asyncio.run(best_first_demonstrate_shutdown())
```

### 4.14. Example: `BestFirstCrawlingStrategy` - Explaining the effect of `BATCH_SIZE` on `arun_many`.

`BATCH_SIZE` is an internal constant in `bff_strategy.py` (typically 10). This example explains its role rather than making it directly configurable by the user through the strategy's constructor, as it's an internal implementation detail of how the strategy uses `crawler.arun_many`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BestFirstCrawlingStrategy, KeywordRelevanceScorer
from unittest.mock import patch

# Note: BATCH_SIZE is internal to BestFirstCrawlingStrategy, usually 10.
# We can't directly set it, but we can explain its effect.


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def best_first_batch_size_effect():
    """Print an explanation of `BATCH_SIZE`, then stream a small crawl and list scores in arrival order."""
    print("--- Explaining BATCH_SIZE in BestFirstCrawlingStrategy ---")
    print("BestFirstCrawlingStrategy processes URLs in batches for efficiency.")
    print("Internally, it retrieves a batch of highest-priority URLs (typically up to BATCH_SIZE, e.g., 10) from its queue.")
    print("It then calls `crawler.arun_many()` with this batch.")
    print("This means that while URLs are prioritized, the order within a small batch might not be strictly descending by score,")
    print("especially if `stream=True`, as results from `arun_many` can arrive slightly out of strict submission order.")
    print("The overall crawl still heavily favors higher-scored URLs first over many batches.")

    # To simulate observing this, let's run a crawl and see if groups of results are processed.
    scorer = KeywordRelevanceScorer(keywords=["feature", "core", "page1", "page2"])
    strategy = BestFirstCrawlingStrategy(
        max_depth=2,
        url_scorer=scorer,
        max_pages=6,  # Small enough to potentially see batching effects if BATCH_SIZE was smaller
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        stream=True,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        print("\n--- Crawl Example (max_pages=6) ---")
        results_in_order = []
        async for result in await crawler.arun(url=start_url, config=run_config):
            if result.success:
                results_in_order.append(result.metadata.get('score', 0.0))
                print(f" Streamed: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}")

        # This assertion is hard to make definitively without knowing the exact internal BATCH_SIZE
        # and perfect mock site behavior. The print statements are more illustrative.
        print("\nScores in order of processing:", [f"{s:.2f}" for s in results_in_order])
        print("Observe if there are small groups where order might not be strictly descending due to batch processing.")


if __name__ == "__main__":
    asyncio.run(best_first_batch_size_effect())
```

---

## 5. Configuring Filters (`FilterChain`) for Deep Crawling

Filters allow you to control which URLs are processed during a deep crawl. They are applied *before* a URL is added to the crawl queue (except for the start URL).

### 5.1. `URLPatternFilter`

#### 5.1.1. Example: Using `URLPatternFilter` to allow URLs matching specific patterns (e.g., `*/blog/*`).
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def filter_allow_pattern():
    """Run a depth-1 BFS crawl with an allow-pattern `URLPatternFilter` and check discovered URLs match."""
    # Allow only URLs containing '/blog/'
    url_filter = URLPatternFilter(patterns=["*/blog/*"])
    filter_chain = FilterChain(filters=[url_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- URLPatternFilter: Allowing '*/blog/*' ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            print(f" URL: {r.url} (Depth: {r.metadata.get('depth')})")
            if r.metadata.get('depth', 0) > 0:  # Check discovered URLs
                assert "/blog/" in r.url, f"Page {r.url} does not match pattern."
        print("All discovered pages match the allowed pattern.")


if __name__ == "__main__":
    asyncio.run(filter_allow_pattern())
```

#### 5.1.2. Example: Using `URLPatternFilter` to block URLs matching specific patterns (e.g., `*/login/*`, `*/archive/*`).
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def filter_block_pattern():
    """Run a depth-1 BFS crawl with a blocking `URLPatternFilter` and check no blocked URL is returned."""
    # Block URLs containing '/login/' or '/archive/'
    url_filter = URLPatternFilter(patterns=["*/login/*", "*/archive/*"], block_list=True)
    filter_chain = FilterChain(filters=[url_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- URLPatternFilter: Blocking '*/login/*' and '*/archive/*' ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            print(f" URL: {r.url} (Depth: {r.metadata.get('depth')})")
            assert "/login/" not in r.url, f"Page {r.url} should have been blocked (login)."
            assert "/archive/" not in r.url, f"Page {r.url} should have been blocked (archive)."
        print("No pages matching blocked patterns were crawled.")


if __name__ == "__main__":
    asyncio.run(filter_block_pattern())
```

#### 5.1.3. Example: `URLPatternFilter` with `case_sensitive=True` vs. `case_sensitive=False`.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
from unittest.mock import patch

# Add a case-specific URL to MOCK_SITE_DATA and link to it from the index page,
# so the crawl can discover both 'page1.html' and 'Page1.html'.
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/Page1.html"] = {
    "html_content": "<html><body><p>Content for case test.</p></body></html>",
    "response_headers": {"Content-Type": "text/html"},
}
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] += (
    '<a href="https://docs.crawl4ai.com/vibe-examples/Page1.html">Page 1 Case Test</a>'
)


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def filter_pattern_case_sensitivity():
    """Crawl the same start URL twice to contrast case-sensitive and case-insensitive pattern filters."""
    start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"

    # Case-sensitive: should only match 'page1.html'
    print("\n--- URLPatternFilter: Case Sensitive (Allow '*/page1.html*') ---")
    url_filter_sensitive = URLPatternFilter(patterns=["*/page1.html*"], case_sensitive=True)
    filter_chain_sensitive = FilterChain(filters=[url_filter_sensitive])
    strategy_sensitive = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain_sensitive)
    run_config_sensitive = CrawlerRunConfig(deep_crawl_strategy=strategy_sensitive, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        results_sensitive = await crawler.arun(url=start_url, config=run_config_sensitive)
        print(f"Crawled {len(results_sensitive)} pages.")
        for r in results_sensitive:
            print(f" URL: {r.url}")
            if r.metadata.get('depth', 0) > 0:
                assert "page1.html" in r.url and "Page1.html" not in r.url, "Case-sensitive filter failed."

    # Case-insensitive: should match both 'page1.html' and 'Page1.html'
    print("\n--- URLPatternFilter: Case Insensitive (Allow '*/page1.html*') ---")
    url_filter_insensitive = URLPatternFilter(patterns=["*/page1.html*"], case_sensitive=False)
    filter_chain_insensitive = FilterChain(filters=[url_filter_insensitive])
    strategy_insensitive = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain_insensitive)
    run_config_insensitive = CrawlerRunConfig(deep_crawl_strategy=strategy_insensitive, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        results_insensitive = await crawler.arun(url=start_url, config=run_config_insensitive)
        print(f"Crawled {len(results_insensitive)} pages.")
        found_page1_lower = False
        found_page1_upper = False
        for r in results_insensitive:
            print(f" URL: {r.url}")
            # Compare the last path segment exactly so the two spellings are told apart.
            file_name = r.url.rsplit("/", 1)[-1]
            if file_name == "page1.html":
                found_page1_lower = True
            elif file_name == "Page1.html":
                found_page1_upper = True
        assert found_page1_lower and found_page1_upper, "Case-insensitive filter should have matched both cases."


if __name__ == "__main__":
    asyncio.run(filter_pattern_case_sensitivity())
```

### 5.2. `DomainFilter`

#### 5.2.1. Example: Using `DomainFilter` with `allowed_domains` to restrict crawling to a list of specific domains.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BFSDeepCrawlStrategy, FilterChain, DomainFilter
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def filter_allowed_domains():
    """Run a depth-1 BFS crawl with `allowed_domains` and check every result is on the allowed domain."""
    # Only crawl within 'docs.crawl4ai.com'
    domain_filter = DomainFilter(allowed_domains=["docs.crawl4ai.com"])
    filter_chain = FilterChain(filters=[domain_filter])

    # include_external needs to be True for DomainFilter to even consider other domains for blocking/allowing
    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"  # This links to external-site.com
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DomainFilter: Allowing only 'docs.crawl4ai.com' ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            print(f" URL: {r.url}")
            assert "docs.crawl4ai.com" in r.url, f"Page {r.url} is not from an allowed domain."
        print("All crawled pages are from 'docs.crawl4ai.com'.")


if __name__ == "__main__":
    asyncio.run(filter_allowed_domains())
```

#### 5.2.2. Example: Using `DomainFilter` with `blocked_domains` to avoid crawling certain domains.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BFSDeepCrawlStrategy, FilterChain, DomainFilter
from unittest.mock import patch


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def filter_blocked_domains():
    """Run a depth-1 BFS crawl with `blocked_domains` and check no result is on the blocked domain."""
    # Block 'external-site.com'
    domain_filter = DomainFilter(blocked_domains=["external-site.com"])
    filter_chain = FilterChain(filters=[domain_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DomainFilter: Blocking 'external-site.com' ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            print(f" URL: {r.url}")
            assert "external-site.com" not in r.url, f"Page {r.url} from blocked domain was crawled."
        print("No pages from 'external-site.com' were crawled.")


if __name__ == "__main__":
    asyncio.run(filter_blocked_domains())
```

#### 5.2.3. Example: `DomainFilter` configured to allow subdomains (`allow_subdomains=True`).

(Conceptual as MOCK_SITE_DATA doesn't have subdomains for `docs.crawl4ai.com`.)

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BFSDeepCrawlStrategy, FilterChain, DomainFilter
from unittest.mock import patch

# Imagine MOCK_SITE_DATA also has:
# "https://blog.docs.crawl4ai.com/vibe-examples/post.html": { ... }
# And index.html links to it.


@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
async def filter_allow_subdomains():
    """Run a depth-1 BFS crawl with `allow_subdomains=True` and print the crawled URLs (no assertions)."""
    domain_filter = DomainFilter(allowed_domains=["docs.crawl4ai.com"], allow_subdomains=True)
    filter_chain = FilterChain(filters=[domain_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DomainFilter: Allowing subdomains of 'docs.crawl4ai.com' (Conceptual) ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            print(f" URL: {r.url}")
        # In a real test, you'd check if blog.docs.crawl4ai.com was included
        print("This example is conceptual; for a real test, ensure mock data includes subdomains.")


if __name__ == "__main__":
    asyncio.run(filter_allow_subdomains())
```

#### 5.2.4. Example: `DomainFilter` configured to disallow subdomains (`allow_subdomains=False`).

(Conceptual as MOCK_SITE_DATA doesn't have subdomains for `docs.crawl4ai.com`.)
```python
import asyncio
from unittest.mock import patch

from crawl4ai import (
    AsyncWebCrawler,
    BFSDeepCrawlStrategy,
    CacheMode,
    CrawlerRunConfig,
    DomainFilter,
    FilterChain,
)


# Patch the name in *this* module: `from crawl4ai import AsyncWebCrawler` has
# already bound a local reference that patching `crawl4ai.AsyncWebCrawler`
# would leave untouched.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def filter_disallow_subdomains():
    """Deep-crawl one level with a DomainFilter meant to reject subdomains."""
    # NOTE(review): confirm that the installed crawl4ai DomainFilter accepts
    # `allow_subdomains`, and whether False really is its default.
    domain_filter = DomainFilter(allowed_domains=["docs.crawl4ai.com"], allow_subdomains=False)
    filter_chain = FilterChain(filters=[domain_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- DomainFilter: Disallowing subdomains of 'docs.crawl4ai.com' (Conceptual) ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            print(f" URL: {r.url}")
        # In a real test, you'd check if blog.docs.crawl4ai.com was NOT included
        print("This example is conceptual; for a real test, ensure mock data includes subdomains to be excluded.")


if __name__ == "__main__":
    asyncio.run(filter_disallow_subdomains())
```

### 5.3. `ContentTypeFilter`

#### 5.3.1. Example: Using `ContentTypeFilter` to allow only `text/html` pages.
```python
import asyncio
from unittest.mock import patch

from crawl4ai import (
    AsyncWebCrawler,
    BFSDeepCrawlStrategy,
    CacheMode,
    ContentTypeFilter,
    CrawlerRunConfig,
    FilterChain,
)


# Patch the name in *this* module: `from crawl4ai import AsyncWebCrawler` has
# already bound a local reference that patching `crawl4ai.AsyncWebCrawler`
# would leave untouched.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def filter_allow_html_only():
    """Deep-crawl one level and verify every discovered page is text/html."""
    content_filter = ContentTypeFilter(allowed_types=["text/html"])
    filter_chain = FilterChain(filters=[content_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"  # Links to HTML and PDF
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- ContentTypeFilter: Allowing only 'text/html' ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            # Fall back to empty dicts so a result without headers or
            # metadata is reported instead of raising AttributeError.
            content_type = (r.response_headers or {}).get('Content-Type', '')
            print(f" URL: {r.url}, Content-Type: {content_type}")
            if (r.metadata or {}).get('depth', 0) > 0:  # Check discovered URLs
                assert "text/html" in content_type, f"Page {r.url} has wrong content type: {content_type}"
        print("All discovered pages are 'text/html'.")


if __name__ == "__main__":
    asyncio.run(filter_allow_html_only())
```

#### 5.3.2. Example: Using `ContentTypeFilter` with multiple `allowed_types` (e.g., `text/html`, `application/json`).
(Conceptual, as MOCK_SITE_DATA only has html/pdf)

```python
import asyncio
from unittest.mock import patch

from crawl4ai import (
    AsyncWebCrawler,
    BFSDeepCrawlStrategy,
    CacheMode,
    ContentTypeFilter,
    CrawlerRunConfig,
    FilterChain,
)


# Patch the name in *this* module: `from crawl4ai import AsyncWebCrawler` has
# already bound a local reference that patching `crawl4ai.AsyncWebCrawler`
# would leave untouched.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def filter_allow_multiple_types():
    """Deep-crawl one level allowing both HTML and JSON pages.

    Temporarily adds a JSON page (and a link to it) to the shared
    MOCK_SITE_DATA and restores the mock data afterwards, even on failure.
    """
    content_filter = ContentTypeFilter(allowed_types=["text/html", "application/json"])
    filter_chain = FilterChain(filters=[content_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"
        json_url = "https://docs.crawl4ai.com/vibe-examples/page1_sub3.json"

        # Imagine page1.html also links to a page1_sub3.json
        original_html = MOCK_SITE_DATA[start_url]["html_content"]
        MOCK_SITE_DATA[json_url] = {
            "html_content": '{"key": "value"}',
            "response_headers": {"Content-Type": "application/json"}
        }
        # The injected snippet must be a real anchor, otherwise the crawler
        # has no link to follow to the JSON page.
        MOCK_SITE_DATA[start_url]["html_content"] = original_html + '<a href="page1_sub3.json">JSON Data</a>'

        try:
            results = await crawler.arun(url=start_url, config=run_config)

            print("--- ContentTypeFilter: Allowing 'text/html', 'application/json' ---")
            print(f"Crawled {len(results)} pages.")
            found_json = False
            for r in results:
                # Fall back to empty dicts so a result without headers or
                # metadata is reported instead of raising AttributeError.
                content_type = (r.response_headers or {}).get('Content-Type', '')
                print(f" URL: {r.url}, Content-Type: {content_type}")
                if (r.metadata or {}).get('depth', 0) > 0:
                    assert "text/html" in content_type or "application/json" in content_type
                    if "application/json" in content_type:
                        found_json = True
            assert found_json, "Expected to find a JSON page."
            print("All discovered pages are either 'text/html' or 'application/json'.")
        finally:
            # Clean up mock data, even when the crawl or an assertion fails,
            # so later examples see the unmodified shared fixture.
            MOCK_SITE_DATA.pop(json_url, None)
            MOCK_SITE_DATA[start_url]["html_content"] = original_html


if __name__ == "__main__":
    asyncio.run(filter_allow_multiple_types())
```

#### 5.3.3. Example: Using `ContentTypeFilter` with `blocked_types` (e.g., blocking `application/pdf`).

```python
import asyncio
from unittest.mock import patch

from crawl4ai import (
    AsyncWebCrawler,
    BFSDeepCrawlStrategy,
    CacheMode,
    ContentTypeFilter,
    CrawlerRunConfig,
    FilterChain,
)


# Patch the name in *this* module: `from crawl4ai import AsyncWebCrawler` has
# already bound a local reference that patching `crawl4ai.AsyncWebCrawler`
# would leave untouched.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def filter_block_pdf():
    """Deep-crawl one level and verify that no crawled page is a PDF."""
    # NOTE(review): confirm that the installed crawl4ai ContentTypeFilter
    # accepts the `blocked_types` keyword before relying on this example.
    content_filter = ContentTypeFilter(blocked_types=["application/pdf"])
    filter_chain = FilterChain(filters=[content_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"  # Links to HTML and PDF
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- ContentTypeFilter: Blocking 'application/pdf' ---")
        print(f"Crawled {len(results)} pages.")
        for r in results:
            # Fall back to an empty dict so a result without headers is
            # reported instead of raising AttributeError.
            content_type = (r.response_headers or {}).get('Content-Type', '')
            print(f" URL: {r.url}, Content-Type: {content_type}")
            assert "application/pdf" not in content_type, f"PDF page {r.url} was not blocked."
        print("No 'application/pdf' pages were crawled (beyond start URL if it was PDF).")


if __name__ == "__main__":
    asyncio.run(filter_block_pdf())
```

### 5.4. `URLFilter` (Simple exact match)

#### 5.4.1. Example: `URLFilter` to allow a specific list of exact URLs.
```python
import asyncio
from unittest.mock import patch

from crawl4ai import (
    AsyncWebCrawler,
    BFSDeepCrawlStrategy,
    CacheMode,
    CrawlerRunConfig,
    FilterChain,
    URLFilter,
)


# Patch the name in *this* module: `from crawl4ai import AsyncWebCrawler` has
# already bound a local reference that patching `crawl4ai.AsyncWebCrawler`
# would leave untouched.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def filter_allow_exact_urls():
    """Deep-crawl two levels, following only links on an explicit allow list."""
    allowed_urls = [
        "https://docs.crawl4ai.com/vibe-examples/page1.html",
        "https://docs.crawl4ai.com/vibe-examples/page1_sub1.html"
    ]
    # NOTE(review): confirm that the installed crawl4ai URLFilter can be
    # instantiated with `urls` / `block_list` before relying on this example.
    url_filter = URLFilter(urls=allowed_urls, block_list=False)  # Allow list
    filter_chain = FilterChain(filters=[url_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=2, filter_chain=filter_chain)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- URLFilter: Allowing specific URLs ---")
        print(f"Crawled {len(results)} pages.")
        crawled_urls = {r.url for r in results}
        # The start URL is always crawled initially, then its links are filtered.
        # So we check that all *other* crawled URLs are in the allowed list.
        for r_url in crawled_urls:
            if r_url != start_url:  # Exclude start_url from this assertion
                assert r_url in allowed_urls, f"URL {r_url} was not in the allowed list."
        print("Only URLs from the allowed list (plus start_url) were crawled.")


if __name__ == "__main__":
    asyncio.run(filter_allow_exact_urls())
```

#### 5.4.2. Example: `URLFilter` to block a specific list of exact URLs.
```python
import asyncio
from unittest.mock import patch

from crawl4ai import (
    AsyncWebCrawler,
    BFSDeepCrawlStrategy,
    CacheMode,
    CrawlerRunConfig,
    FilterChain,
    URLFilter,
)


# Patch the name in *this* module: `from crawl4ai import AsyncWebCrawler` has
# already bound a local reference that patching `crawl4ai.AsyncWebCrawler`
# would leave untouched.
@patch(f"{__name__}.AsyncWebCrawler", MockAsyncWebCrawler)
async def filter_block_exact_urls():
    """Deep-crawl one level and verify the explicitly blocked URLs are skipped."""
    blocked_urls = [
        "https://docs.crawl4ai.com/vibe-examples/page2.html",
        "https://docs.crawl4ai.com/vibe-examples/archive/old_page.html"
    ]
    # NOTE(review): confirm that the installed crawl4ai URLFilter can be
    # instantiated with `urls` / `block_list` before relying on this example.
    url_filter = URLFilter(urls=blocked_urls, block_list=True)  # Block list
    filter_chain = FilterChain(filters=[url_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain)
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
        results = await crawler.arun(url=start_url, config=run_config)

        print("--- URLFilter: Blocking specific URLs ---")
        print(f"Crawled {len(results)} pages.")
        crawled_urls = {r.url for r in results}
        for blocked_url in blocked_urls:
            assert blocked_url not in crawled_urls, f"URL {blocked_url} should have been blocked."
        print("Blocked URLs were not crawled.")


if __name__ == "__main__":
    asyncio.run(filter_block_exact_urls())
```

### 5.5. `ContentRelevanceFilter`

This filter uses an LLM to determine relevance. The example focuses on setup, as a full run requires an LLM.

#### 5.5.1. Example: Setting up `ContentRelevanceFilter` with target keywords (conceptual, focusing on setup).

```python
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    BFSDeepCrawlStrategy,
    CacheMode,
    ContentRelevanceFilter,
    CrawlerRunConfig,
    FilterChain,
    LLMConfig,
)

# This is a conceptual example showing setup.
# A real run would require an LLM provider to be configured.


async def setup_content_relevance_filter():
    """Build a crawl configuration around a ContentRelevanceFilter without running it."""
    print("--- Setting up ContentRelevanceFilter (Conceptual) ---")

    # Define keywords and context for relevance
    keywords = ["artificial intelligence", "web crawling", "data extraction"]
    context_query = "Articles related to AI-powered web scraping tools and techniques."

    # Configure LLM (replace with your actual provider and API key)
    llm_config = LLMConfig(provider="openai/gpt-3.5-turbo", api_token="YOUR_OPENAI_API_KEY")

    # NOTE(review): confirm these keyword arguments against the
    # ContentRelevanceFilter signature of the installed crawl4ai version.
    relevance_filter = ContentRelevanceFilter(
        llm_config=llm_config,
        keywords=keywords,
        context_query=context_query,
        threshold=0.6  # Adjust threshold as needed
    )
    filter_chain = FilterChain(filters=[relevance_filter])

    strategy = BFSDeepCrawlStrategy(max_depth=1, filter_chain=filter_chain)
    # Only consumed by the commented-out crawl below.
    run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)

    print("ContentRelevanceFilter configured. To run this example:")
    print("1. Replace 'YOUR_OPENAI_API_KEY' with your actual OpenAI API key.")
    print("2. (Optional) Install OpenAI client: pip install openai")
    print("3. Uncomment the crawler execution part below.")

    # # Example of how it would be used (requires actual LLM call)
    # async with AsyncWebCrawler() as crawler:
    #     # Mock or use a real URL that would trigger the LLM
    #     start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"
    #     print(f"Attempting to crawl {start_url} with ContentRelevanceFilter...")
    #     # results = await crawler.arun(url=start_url, config=run_config)
    #     # print(f"Crawled {len(results)} pages after relevance filtering.")
    #     # for r in results:
    #     #     print(f" URL: {r.url}, Relevance Score: {r.metadata.get('relevance_score')}")
    print("Conceptual setup complete.")


if __name__ == "__main__":
    asyncio.run(setup_content_relevance_filter())
```

#### 5.5.2. Example: `ContentRelevanceFilter` with a custom `threshold`.
```python
import asyncio

from crawl4ai import ContentRelevanceFilter, LLMConfig


async def content_relevance_custom_threshold():
    """Create a strict and a lenient ContentRelevanceFilter and report each threshold."""
    print("--- ContentRelevanceFilter with custom threshold (Conceptual Setup) ---")
    llm_config = LLMConfig(provider="openai/gpt-3.5-turbo", api_token="YOUR_OPENAI_API_KEY")  # Replace

    # NOTE(review): confirm these keyword arguments against the
    # ContentRelevanceFilter signature of the installed crawl4ai version.
    variants = (
        # A higher threshold means stricter relevance checking
        ("Strict", ["specific technical term"], 0.8),
        # A lower threshold is more lenient
        ("Lenient", ["general topic"], 0.4),
    )
    for label, keywords, threshold in variants:
        relevance_filter = ContentRelevanceFilter(
            llm_config=llm_config,
            keywords=keywords,
            threshold=threshold,
        )
        print(f"{label} filter created with threshold: {relevance_filter.threshold}")

    print("Note: Actual filtering behavior depends on LLM responses to content.")


if __name__ == "__main__":
    asyncio.run(content_relevance_custom_threshold())
```

### 5.6. `SEOFilter`

This filter checks for common SEO issues. The example is conceptual, focusing on setup.

#### 5.6.1. Example: Basic `SEOFilter` with default SEO checks (conceptual, focusing on setup).

```python
import asyncio

from crawl4ai import SEOFilter


async def setup_basic_seo_filter():
    """Instantiate an SEOFilter with no arguments and print some of its settings."""
    print("--- Basic SEOFilter with default checks (Conceptual Setup) ---")

    # Default checks might include missing title, short meta description, etc.
    seo_filter = SEOFilter()
    print("SEOFilter created with default settings:")

    # NOTE(review): confirm these attribute names exist on the installed
    # crawl4ai SEOFilter.
    reported_settings = (
        ("Min Title Length", "min_title_length"),
        ("Max Title Length", "max_title_length"),
        ("Min Meta Description Length", "min_meta_description_length"),
    )
    for label, attribute in reported_settings:
        print(f" {label}: {getattr(seo_filter, attribute)}")
    # ... and other default parameters

    print("This filter would be added to a FilterChain and used in a DeepCrawlStrategy.")
    print("It would then check each page against these SEO criteria.")


if __name__ == "__main__":
    asyncio.run(setup_basic_seo_filter())
```

#### 5.6.2.
Example: `SEOFilter` configuring specific checks like `min_title_length`, `max_meta_description_length`, or `keyword_in_title_check` (conceptual).

```python
import asyncio

from crawl4ai import SEOFilter


async def setup_custom_seo_filter():
    """Instantiate an SEOFilter with explicit criteria and echo them back."""
    print("--- SEOFilter with custom checks (Conceptual Setup) ---")

    # NOTE(review): confirm these keyword arguments against the SEOFilter
    # signature of the installed crawl4ai version.
    seo_options = {
        "min_title_length": 20,
        "max_meta_description_length": 150,
        "keyword_in_title_check": True,
        "target_keywords_for_seo": ["crawl4ai", "web scraping"],  # if keyword_in_title_check is True
    }
    custom_seo_filter = SEOFilter(**seo_options)

    print("Custom SEOFilter created with:")
    reported_settings = (
        ("Min Title Length", "min_title_length"),
        ("Max Meta Description Length", "max_meta_description_length"),
        ("Keyword in Title Check", "keyword_in_title_check"),
        ("Target SEO Keywords", "target_keywords_for_seo"),
    )
    for label, attribute in reported_settings:
        print(f" {label}: {getattr(custom_seo_filter, attribute)}")

    print("This filter would apply these specific criteria during a crawl.")


if __name__ == "__main__":
    asyncio.run(setup_custom_seo_filter())
```

### 5.7. `FilterChain`

#### 5.7.1. Example: Combining `URLPatternFilter` (allow `/products/*`) and `DomainFilter` (only `example.com`) in a `FilterChain`. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter, DomainFilter from unittest.mock import patch # Add mock data for this scenario MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/products/productA.html"] = { "html_content": "