3525 lines
154 KiB
Plaintext
3525 lines
154 KiB
Plaintext
# Examples Outline for crawl4ai - vibe Component
|
|
|
|
**Target Document Type:** Examples Collection
|
|
**Target Output Filename Suggestion:** `llm_examples_vibe.md`
|
|
**Library Version Context:** 0.6.3
|
|
**Outline Generation Date:** 2024-05-24
|
|
---
|
|
|
|
This document provides a collection of runnable code examples for the `vibe` component of the `crawl4ai` library, focusing on its deep crawling capabilities, filtering, and scoring mechanisms.
|
|
|
|
**Note on URLs:** Most examples use placeholder URLs like `https://docs.crawl4ai.com/vibe-examples/pageN.html`. These are for demonstration and will be mocked to return predefined content. Replace them with actual URLs for real-world use.
|
|
|
|
**Common Imports (assumed for many examples below, but will be included in each runnable block):**
|
|
```python
|
|
import asyncio
|
|
import time
|
|
import re
|
|
from pathlib import Path
|
|
import os # For local file examples
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
CrawlerRunConfig,
|
|
CrawlResult,
|
|
BrowserConfig,
|
|
CacheMode,
|
|
# Deep Crawling Strategies
|
|
BFSDeePCrawlStrategy,
|
|
DFSDeePCrawlStrategy,
|
|
BestFirstCrawlingStrategy,
|
|
DeepCrawlStrategy, # For custom strategy
|
|
# Filters
|
|
FilterChain,
|
|
URLPatternFilter,
|
|
DomainFilter,
|
|
ContentTypeFilter,
|
|
URLFilter,
|
|
ContentRelevanceFilter, # Conceptual
|
|
SEOFilter, # Conceptual
|
|
FilterStats,
|
|
# Scorers
|
|
URLScorer, # For custom scorer
|
|
KeywordRelevanceScorer,
|
|
PathDepthScorer,
|
|
ContentTypeScorer,
|
|
DomainAuthorityScorer, # Conceptual
|
|
FreshnessScorer, # Conceptual
|
|
CompositeScorer,
|
|
# Other
|
|
LLMExtractionStrategy, # For combination example
|
|
AsyncLogger # For custom logger example
|
|
)
|
|
from unittest.mock import patch, AsyncMock # For mocking network calls
|
|
|
|
# --- Mock Website Data ---
|
|
# This data will be used by the MockAsyncWebCrawler to simulate a website
|
|
MOCK_SITE_DATA = {
|
|
"https://docs.crawl4ai.com/vibe-examples/index.html": {
|
|
"html_content": """
|
|
<html><head><title>Index</title></head><body>
|
|
<h1>Main Page</h1>
|
|
<a href="page1.html">Page 1</a>
|
|
<a href="page2.html">Page 2 (Feature)</a>
|
|
<a href="https://external-site.com/pageA.html">External Site</a>
|
|
<a href="/vibe-examples/archive/old_page.html">Archive</a>
|
|
<a href="/vibe-examples/blog/post1.html">Blog Post 1</a>
|
|
<a href="/vibe-examples/login.html">Login</a>
|
|
<a href="javascript:void(0);" onclick="document.body.innerHTML += '<a href=js_page.html>JS Link</a>'">Load JS Link</a>
|
|
</body></html>
|
|
""",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/page1.html": {
|
|
"html_content": """
|
|
<html><head><title>Page 1</title></head><body>
|
|
<h2>Page One</h2>
|
|
<p>This is page 1. It has some core content about crawl strategies.</p>
|
|
<a href="page1_sub1.html">Sub Page 1.1</a>
|
|
<a href="page1_sub2.pdf">Sub Page 1.2 (PDF)</a>
|
|
<a href="index.html">Back to Index</a>
|
|
</body></html>
|
|
""",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/page1_sub1.html": {
|
|
"html_content": "<html><head><title>Sub Page 1.1</title></head><body><p>Sub page 1.1 content. More on core concepts.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/page1_sub2.pdf": {
|
|
"html_content": "%PDF-1.4 ... (Mock PDF Content: Crawl examples)", # Mock PDF content
|
|
"response_headers": {"Content-Type": "application/pdf"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/page2.html": {
|
|
"html_content": """
|
|
<html><head><title>Page 2 - Feature Rich</title></head><body>
|
|
<h2>Page Two with Feature</h2>
|
|
<p>This page discusses a key feature and advanced configuration for async tasks.</p>
|
|
<a href="page2_sub1.html">Sub Page 2.1</a>
|
|
</body></html>
|
|
""",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/page2_sub1.html": {
|
|
"html_content": "<html><head><title>Sub Page 2.1</title></head><body><p>More about the feature and JavaScript interaction.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/archive/old_page.html": {
|
|
"html_content": "<html><head><title>Old Page</title></head><body><p>Archived content, less relevant.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/blog/post1.html": {
|
|
"html_content": "<html><head><title>Blog Post 1</title></head><body><p>This is a blog post about core ideas and examples.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/login.html": {
|
|
"html_content": "<html><head><title>Login</title></head><body><form>...</form></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://docs.crawl4ai.com/vibe-examples/js_page.html": {
|
|
"html_content": "<html><head><title>JS Page</title></head><body><p>Content loaded by JavaScript.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"https://external-site.com/pageA.html": {
|
|
"html_content": "<html><head><title>External Page A</title></head><body><p>Content from external site about other topics.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
# For local file examples
|
|
"file:" + str(Path(os.getcwd()) / "test_local_index.html"): {
|
|
"html_content": """
|
|
<html><head><title>Local Index</title></head><body>
|
|
<h1>Local Main Page</h1>
|
|
<a href="test_local_page1.html">Local Page 1</a>
|
|
<a href="https://docs.crawl4ai.com/vibe-examples/index.html">Web Index</a>
|
|
</body></html>
|
|
""",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
},
|
|
"file:" + str(Path(os.getcwd()) / "test_local_page1.html"): {
|
|
"html_content": "<html><head><title>Local Page 1</title></head><body><p>Local page 1 content.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
}
|
|
}
|
|
|
|
# Create a dummy local file for testing
|
|
Path("test_local_index.html").write_text(MOCK_SITE_DATA["file:" + str(Path(os.getcwd()) / "test_local_index.html")]["html_content"])
|
|
Path("test_local_page1.html").write_text(MOCK_SITE_DATA["file:" + str(Path(os.getcwd()) / "test_local_page1.html")]["html_content"])
|
|
|
|
|
|
# --- Mock AsyncWebCrawler ---
|
|
# This mock crawler will simulate fetching pages from MOCK_SITE_DATA
|
|
class MockAsyncWebCrawler(AsyncWebCrawler):
|
|
async def _fetch_page(self, url: str, config: CrawlerRunConfig):
|
|
# Simulate network delay
|
|
await asyncio.sleep(0.01)
|
|
|
|
# Normalize URL for lookup (e.g. relative to absolute)
|
|
if not url.startswith("file:") and not url.startswith("http"):
|
|
# This is a simplified relative URL resolver for the mock
|
|
base_parts = self.current_url.split('/')[:-1] if hasattr(self, 'current_url') and self.current_url else []
|
|
normalized_url = "/".join(base_parts + [url])
|
|
if "docs.crawl4ai.com" not in normalized_url and not normalized_url.startswith("file:"): # ensure base domain
|
|
normalized_url = "https://docs.crawl4ai.com/vibe-examples/" + url.lstrip("/")
|
|
else:
|
|
normalized_url = url
|
|
|
|
if normalized_url in MOCK_SITE_DATA:
|
|
page_data = MOCK_SITE_DATA[normalized_url]
|
|
self.current_url = normalized_url # Store for relative path resolution
|
|
|
|
# Basic link extraction for deep crawling
|
|
from bs4 import BeautifulSoup
|
|
soup = BeautifulSoup(page_data["html_content"], 'html.parser')
|
|
links = []
|
|
for a_tag in soup.find_all('a', href=True):
|
|
href = a_tag['href']
|
|
# Simple relative to absolute conversion for mock
|
|
if not href.startswith("http") and not href.startswith("file:") and not href.startswith("javascript:"):
|
|
abs_href = "/".join(normalized_url.split('/')[:-1]) + "/" + href.lstrip("./")
|
|
# Further simplify to ensure it hits mock data, very basic
|
|
if "docs.crawl4ai.com" in abs_href: # if it's a vibe-example page
|
|
abs_href = "https://docs.crawl4ai.com/vibe-examples/" + Path(href).name
|
|
elif "external-site.com" in abs_href:
|
|
abs_href = "https://external-site.com/" + Path(href).name
|
|
|
|
elif href.startswith("file:"): # Keep file URLs as is
|
|
abs_href = href
|
|
elif href.startswith("javascript:"):
|
|
abs_href = None # Skip JS links for this mock
|
|
else:
|
|
abs_href = href
|
|
|
|
if abs_href:
|
|
links.append({"href": abs_href, "text": a_tag.get_text(strip=True)})
|
|
|
|
return CrawlResult(
|
|
url=normalized_url,
|
|
html_content=page_data["html_content"],
|
|
success=True,
|
|
status_code=200,
|
|
response_headers=page_data.get("response_headers", {"Content-Type": "text/html"}),
|
|
links={"internal": [l for l in links if "docs.crawl4ai.com/vibe-examples" in l["href"] or l["href"].startswith("file:")],
|
|
"external": [l for l in links if "external-site.com" in l["href"]]}
|
|
)
|
|
else:
|
|
# print(f"Mock Warning: URL not found in MOCK_SITE_DATA: {normalized_url} (Original: {url})")
|
|
return CrawlResult(
|
|
url=url, html_content="", success=False, status_code=404, error_message="Mock URL not found"
|
|
)
|
|
|
|
async def arun(self, url: str, config: CrawlerRunConfig = None, **kwargs):
|
|
# This is the method called by DeepCrawlStrategy instances
|
|
# For deep crawls, the strategy itself calls this multiple times.
|
|
# For a single arun call with a deep_crawl_strategy, the decorator handles it.
|
|
|
|
if config and config.deep_crawl_strategy:
|
|
# The decorator usually handles this part. For direct strategy.arun() tests:
|
|
return await config.deep_crawl_strategy.arun(
|
|
crawler=self, # Pass the mock crawler instance
|
|
start_url=url,
|
|
config=config
|
|
)
|
|
|
|
# Fallback to single page fetch if no deep crawl strategy
|
|
self.current_url = url # Set for relative path resolution in _fetch_page
|
|
return await self._fetch_page(url, config)
|
|
|
|
async def arun_many(self, urls: list[str], config: CrawlerRunConfig = None, **kwargs):
|
|
results = []
|
|
for url_item in urls:
|
|
# In BestFirst, arun_many is called with tuples of (score, depth, url, parent_url)
|
|
# For simplicity in mock, we assume url_item is just the URL string here or a tuple where url is at index 2
|
|
current_url_to_crawl = url_item
|
|
if isinstance(url_item, tuple) and len(url_item) >=3 :
|
|
current_url_to_crawl = url_item[2]
|
|
|
|
self.current_url = current_url_to_crawl # Set for relative path resolution
|
|
result = await self._fetch_page(current_url_to_crawl, config)
|
|
results.append(result)
|
|
if config and config.stream:
|
|
async def result_generator():
|
|
for res in results:
|
|
yield res
|
|
return result_generator()
|
|
return results
|
|
|
|
async def __aenter__(self):
|
|
# print("MockAsyncWebCrawler entered")
|
|
return self
|
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
# print("MockAsyncWebCrawler exited")
|
|
pass
|
|
|
|
async def start(self): # Add start method
|
|
# print("MockAsyncWebCrawler started")
|
|
self.ready = True
|
|
return self
|
|
|
|
async def close(self): # Add close method
|
|
# print("MockAsyncWebCrawler closed")
|
|
self.ready = False
|
|
|
|
# --- End Mock ---
|
|
```
|
|
|
|
---
|
|
## 1. Introduction to Deep Crawling (`vibe`)
|
|
|
|
The `vibe` component of Crawl4ai provides powerful deep crawling capabilities, allowing you to traverse websites by following links and processing multiple pages.
|
|
|
|
### 1.1. Example: Enabling Basic Deep Crawl with `BFSDeePCrawlStrategy` via `CrawlerRunConfig`.
|
|
This example demonstrates how to enable a basic Breadth-First Search (BFS) deep crawl by setting the `deep_crawl_strategy` in `CrawlerRunConfig`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
# Using the MockAsyncWebCrawler defined in the preamble
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def basic_bfs_deep_crawl():
|
|
# Configure BFS to crawl up to 1 level deep from the start URL
|
|
bfs_strategy = BFSDeePCrawlStrategy(max_depth=1)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=bfs_strategy,
|
|
# For mock, ensure cache is bypassed to see fresh mock results
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
# The actual AsyncWebCrawler is replaced by MockAsyncWebCrawler via @patch
|
|
async with AsyncWebCrawler() as crawler: # This will be MockAsyncWebCrawler
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- Basic BFS Deep Crawl (max_depth=1) ---")
|
|
print(f"Crawled {len(results)} pages starting from {start_url}:")
|
|
for i, result in enumerate(results):
|
|
if result.success:
|
|
print(f" {i+1}. URL: {result.url}, Depth: {result.metadata.get('depth')}, Parent: {result.metadata.get('parent_url')}")
|
|
else:
|
|
print(f" {i+1}. FAILED: {result.url}, Error: {result.error_message}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(basic_bfs_deep_crawl())
|
|
```
|
|
|
|
### 1.2. Example: Understanding `CrawlResult.metadata` (depth, parent_url, score) in Deep Crawl Results.
|
|
Each `CrawlResult` from a deep crawl contains useful metadata like the crawl `depth`, the `parent_url` from which it was discovered, and a `score` (if applicable, e.g., with `BestFirstCrawlingStrategy`).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, KeywordRelevanceScorer, BestFirstCrawlingStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def understand_metadata():
|
|
# Using BestFirstCrawlingStrategy to demonstrate scores
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "core"])
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- Understanding CrawlResult.metadata ---")
|
|
for result in results:
|
|
if result.success:
|
|
depth = result.metadata.get('depth', 'N/A')
|
|
parent = result.metadata.get('parent_url', 'N/A')
|
|
score = result.metadata.get('score', 'N/A') # Score comes from BestFirst strategy
|
|
print(f"URL: {result.url}")
|
|
print(f" Depth: {depth}")
|
|
print(f" Parent URL: {parent}")
|
|
print(f" Score: {score if score != 'N/A' else 'N/A (not scored or BFS/DFS)'}")
|
|
print("-" * 20)
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(understand_metadata())
|
|
```
|
|
|
|
### 1.3. Example: Minimal setup for deep crawling a single level deep.
|
|
This demonstrates the most straightforward way to perform a shallow deep crawl (depth 1).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def minimal_single_level_deep_crawl():
|
|
# BFS strategy, max_depth=1 means start_url + its direct links
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1)
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- Minimal Single Level Deep Crawl (max_depth=1) ---")
|
|
print(f"Total pages crawled: {len(results)}")
|
|
for result in results:
|
|
if result.success:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(minimal_single_level_deep_crawl())
|
|
```
|
|
|
|
---
|
|
## 2. Breadth-First Search (`BFSDeePCrawlStrategy`) Examples
|
|
|
|
`BFSDeePCrawlStrategy` explores the website level by level.
|
|
|
|
### 2.1. Example: Basic `BFSDeePCrawlStrategy` with default depth.
|
|
The default `max_depth` for `BFSDeePCrawlStrategy` is often 1 if not specified, meaning it crawls the start URL and its direct links.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_default_depth():
|
|
# Default max_depth is typically 1 (start_url + its direct children)
|
|
# but let's be explicit for clarity or test with a higher default if library changes
|
|
strategy = BFSDeePCrawlStrategy() # Default max_depth is 1
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BFS with Default Depth (max_depth=1) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_default_depth())
|
|
```
|
|
|
|
### 2.2. Example: `BFSDeePCrawlStrategy` - Setting `max_depth` to control crawl depth (e.g., 3 levels).
|
|
Control how many levels deep the BFS crawler will go from the start URL. `max_depth=0` means only the start URL. `max_depth=1` means start URL + its direct links.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_set_max_depth():
|
|
strategy = BFSDeePCrawlStrategy(max_depth=2) # Start URL (0), its links (1), and their links (2)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BFS with max_depth=2 ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in sorted(results, key=lambda r: (r.metadata.get('depth', 0), r.url)):
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
# Verify that no pages with depth > 2 are present
|
|
assert all(r.metadata.get('depth', 0) <= 2 for r in results if r.success)
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_set_max_depth())
|
|
```
|
|
|
|
### 2.3. Example: `BFSDeePCrawlStrategy` - Setting `max_pages` to limit the total number of pages crawled (e.g., 10 pages).
|
|
Limit the crawl to a maximum number of pages, regardless of depth.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
import math # for math.inf
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_set_max_pages():
|
|
strategy = BFSDeePCrawlStrategy(
|
|
max_depth=math.inf, # Effectively no depth limit for this test
|
|
max_pages=3 # Limit to 3 pages
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BFS with max_pages=3 ---")
|
|
print(f"Crawled {len(results)} pages (should be at most 3).")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
assert len(results) <= 3
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_set_max_pages())
|
|
```
|
|
|
|
### 2.4. Example: `BFSDeePCrawlStrategy` - Using `include_external=True` to follow links to external domains.
|
|
Allow the BFS crawler to follow links that lead to different domains than the start URL.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_include_external():
|
|
strategy = BFSDeePCrawlStrategy(
|
|
max_depth=1,
|
|
include_external=True
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BFS with include_external=True (max_depth=1) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
found_external = False
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
if "external-site.com" in result.url:
|
|
found_external = True
|
|
|
|
assert found_external, "Expected to crawl an external link."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_include_external())
|
|
```
|
|
|
|
### 2.5. Example: `BFSDeePCrawlStrategy` - Using `include_external=False` (default) to stay within the starting domain.
|
|
The default behavior is to only crawl links within the same domain as the start URL.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_exclude_external():
|
|
strategy = BFSDeePCrawlStrategy(
|
|
max_depth=1,
|
|
include_external=False # Default, but explicit for clarity
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BFS with include_external=False (max_depth=1) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
found_external = False
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
if "external-site.com" in result.url:
|
|
found_external = True
|
|
|
|
assert not found_external, "Should not have crawled external links."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_exclude_external())
|
|
```
|
|
|
|
### 2.6. Example: `BFSDeePCrawlStrategy` - Streaming results using `CrawlerRunConfig(stream=True)`.
|
|
Process results as they become available, useful for long crawls.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_streaming_results():
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=True, # Enable streaming
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
print(f"--- BFS with Streaming Results (max_depth=1) ---")
|
|
count = 0
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
count += 1
|
|
if result.success:
|
|
print(f" Streamed Result {count}: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
else:
|
|
print(f" Streamed FAILED Result {count}: {result.url}, Error: {result.error_message}")
|
|
print(f"Total results streamed: {count}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_streaming_results())
|
|
```
|
|
|
|
### 2.7. Example: `BFSDeePCrawlStrategy` - Batch results using `CrawlerRunConfig(stream=False)` (default).
|
|
The default behavior is to return all results as a list after the crawl completes.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_batch_results():
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=False, # Default, but explicit for clarity
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config) # Returns a list
|
|
|
|
print(f"--- BFS with Batch Results (max_depth=1) ---")
|
|
print(f"Received {len(results)} pages in a batch.")
|
|
for result in results:
|
|
if result.success:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_batch_results())
|
|
```
|
|
|
|
### 2.8. Example: `BFSDeePCrawlStrategy` - Integrating a `FilterChain` with `URLPatternFilter` to crawl specific paths.
|
|
Use filters to guide the crawler, for instance, to only explore URLs matching `/blog/*`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_with_url_pattern_filter():
|
|
# Only crawl URLs containing '/blog/'
|
|
url_filter = URLPatternFilter(patterns=["*/blog/*"])
|
|
filter_chain = FilterChain(filters=[url_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(
|
|
max_depth=1,
|
|
filter_chain=filter_chain
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BFS with URLPatternFilter ('*/blog/*') ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
all_match_pattern = True
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
# The start URL itself might not match, but discovered links should
|
|
if result.metadata.get('depth', 0) > 0 and "/blog/" not in result.url:
|
|
all_match_pattern = False
|
|
|
|
# The start_url itself is always processed, then its links are filtered.
|
|
# So, we check if all *discovered* pages match the pattern.
|
|
discovered_pages = [r for r in results if r.metadata.get('depth',0) > 0]
|
|
if discovered_pages: # only assert if any pages beyond start_url were processed
|
|
assert all("/blog/" in r.url for r in discovered_pages), "Not all crawled pages matched the /blog/ pattern"
|
|
print("Filter applied successfully (start URL is always processed, subsequent links are filtered).")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_with_url_pattern_filter())
|
|
```
|
|
|
|
### 2.9. Example: `BFSDeePCrawlStrategy` - Demonstrating `shutdown()` to gracefully stop an ongoing crawl.
|
|
Showcase how to stop a crawl prematurely using the strategy's `shutdown()` method.
|
|
|
|
```python
|
|
import asyncio
|
|
import time
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_demonstrate_shutdown():
|
|
strategy = BFSDeePCrawlStrategy(
|
|
max_depth=5, # A potentially long crawl
|
|
max_pages=100
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=True, # Streaming is good to see partial results before shutdown
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html" # A site with enough links
|
|
|
|
print(f"--- BFS with shutdown() demonstration ---")
|
|
|
|
crawl_task = asyncio.create_task(crawler.arun(url=start_url, config=run_config))
|
|
|
|
# Let the crawl run for a very short time
|
|
await asyncio.sleep(0.1)
|
|
|
|
print("Attempting to shut down the crawl...")
|
|
await strategy.shutdown()
|
|
|
|
results_list = []
|
|
try:
|
|
# Await the results from the crawl task
|
|
# If streaming, this will iterate through what was processed before shutdown
|
|
async for res in await crawl_task:
|
|
results_list.append(res)
|
|
print(f" Collected result (post-shutdown signal): {res.url}")
|
|
except asyncio.CancelledError:
|
|
print("Crawl task was cancelled.")
|
|
|
|
print(f"Crawl shut down. Processed {len(results_list)} pages before/during shutdown.")
|
|
# The number of pages will be less than if it ran to completion
|
|
assert len(results_list) < 10, "Crawl likely didn't shut down early enough or mock site too small."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_demonstrate_shutdown())
|
|
```
|
|
|
|
### 2.10. Example: `BFSDeePCrawlStrategy` - Crawling with no `max_depth` limit but a `max_pages` limit.
|
|
Demonstrate a scenario where depth is unlimited (or very high) but the crawl stops after a certain number of pages.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
import math
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def bfs_no_depth_limit_max_pages():
|
|
strategy = BFSDeePCrawlStrategy(
|
|
max_depth=math.inf, # Unlimited depth
|
|
max_pages=4 # But only 4 pages
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BFS with no depth limit, max_pages=4 ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
assert len(results) <= 4, "More pages crawled than max_pages limit."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(bfs_no_depth_limit_max_pages())
|
|
```
|
|
|
|
---
|
|
## 3. Depth-First Search (`DFSDeePCrawlStrategy`) Examples
|
|
|
|
`DFSDeePCrawlStrategy` explores as far down one branch as possible before backtracking.
|
|
|
|
### 3.1. Example: Basic `DFSDeePCrawlStrategy` with default depth.
|
|
The default `max_depth` for `DFSDeePCrawlStrategy` is typically 10 if not specified.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_default_depth():
|
|
# Default max_depth for DFS is typically higher (e.g., 10)
|
|
strategy = DFSDeePCrawlStrategy()
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
max_pages=5, # Limit pages to keep example short with default depth
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DFS with Default Depth (max_pages=5 to limit output) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in results: # Order might be less predictable than BFS for small mock
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_default_depth())
|
|
```
|
|
|
|
### 3.2. Example: `DFSDeePCrawlStrategy` - Setting `max_depth` to control how deep each branch goes.
|
|
Set `max_depth` to 2 for a DFS crawl.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_set_max_depth():
|
|
strategy = DFSDeePCrawlStrategy(max_depth=2)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DFS with max_depth=2 ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
assert all(r.metadata.get('depth', 0) <= 2 for r in results if r.success)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_set_max_depth())
|
|
```
|
|
|
|
### 3.3. Example: `DFSDeePCrawlStrategy` - Setting `max_pages` to limit the total number of pages.
|
|
Limit the total number of pages crawled by DFS to 3.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
import math
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_set_max_pages():
|
|
strategy = DFSDeePCrawlStrategy(
|
|
max_depth=math.inf, # No depth limit for this test
|
|
max_pages=3
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DFS with max_pages=3 ---")
|
|
print(f"Crawled {len(results)} pages (should be at most 3).")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
assert len(results) <= 3
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_set_max_pages())
|
|
```
|
|
|
|
### 3.4. Example: `DFSDeePCrawlStrategy` - Following external links with `include_external=True`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_include_external():
|
|
strategy = DFSDeePCrawlStrategy(
|
|
max_depth=1,
|
|
include_external=True,
|
|
max_pages=5 # Limit pages as external can be vast
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DFS with include_external=True (max_depth=1, max_pages=5) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
found_external = False
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
if "external-site.com" in result.url:
|
|
found_external = True
|
|
|
|
assert found_external, "Expected to crawl an external link."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_include_external())
|
|
```
|
|
|
|
### 3.5. Example: `DFSDeePCrawlStrategy` - Staying within the domain with `include_external=False`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_exclude_external():
|
|
strategy = DFSDeePCrawlStrategy(
|
|
max_depth=1,
|
|
include_external=False # Default
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DFS with include_external=False (max_depth=1) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
found_external = False
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
if "external-site.com" in result.url:
|
|
found_external = True
|
|
|
|
assert not found_external, "Should not have crawled external links."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_exclude_external())
|
|
```
|
|
|
|
### 3.6. Example: `DFSDeePCrawlStrategy` - Streaming results.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_streaming_results():
|
|
strategy = DFSDeePCrawlStrategy(max_depth=1)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=True,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
print(f"--- DFS with Streaming Results (max_depth=1) ---")
|
|
count = 0
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
count +=1
|
|
if result.success:
|
|
print(f" Streamed Result {count}: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
print(f"Total results streamed: {count}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_streaming_results())
|
|
```
|
|
|
|
### 3.7. Example: `DFSDeePCrawlStrategy` - Batch results.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_batch_results():
|
|
strategy = DFSDeePCrawlStrategy(max_depth=1)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=False, # Default
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DFS with Batch Results (max_depth=1) ---")
|
|
print(f"Received {len(results)} pages in a batch.")
|
|
for result in results:
|
|
if result.success:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_batch_results())
|
|
```
|
|
|
|
### 3.8. Example: `DFSDeePCrawlStrategy` - Integrating a `FilterChain` with `DomainFilter` to restrict to subdomains.
|
|
This example is conceptual for subdomains as MOCK_SITE_DATA doesn't have distinct subdomains. The filter setup is key.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DFSDeePCrawlStrategy, FilterChain, DomainFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def dfs_with_domain_filter_subdomains():
|
|
# Allow only the start domain and its subdomains
|
|
# For this mock, 'docs.crawl4ai.com' will be the main domain.
|
|
# If we had e.g., 'blog.docs.crawl4ai.com', this filter would allow it.
|
|
domain_filter = DomainFilter(
|
|
allowed_domains=["docs.crawl4ai.com"],
|
|
allow_subdomains=True
|
|
)
|
|
filter_chain = FilterChain(filters=[domain_filter])
|
|
|
|
strategy = DFSDeePCrawlStrategy(
|
|
max_depth=1,
|
|
filter_chain=filter_chain,
|
|
include_external=True # Necessary to even consider other (sub)domains
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DFS with DomainFilter (allow subdomains of docs.crawl4ai.com) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
# In a real scenario, you'd assert that only allowed domains/subdomains are present.
|
|
# Our mock data doesn't have true subdomains to test this effectively.
|
|
assert "docs.crawl4ai.com" in result.url or "external-site.com" not in result.url
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(dfs_with_domain_filter_subdomains())
|
|
```
|
|
|
|
---
|
|
## 4. Best-First Crawling (`BestFirstCrawlingStrategy`) Examples
|
|
|
|
`BestFirstCrawlingStrategy` uses a priority queue, guided by scorers, to decide which URLs to crawl next.
|
|
|
|
### 4.1. Example: Basic `BestFirstCrawlingStrategy` with default parameters.
|
|
If no `url_scorer` is provided, it behaves somewhat like BFS but might have different internal queue management.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_default_params():
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1) # Default scorer (often scores 0)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BestFirstCrawlingStrategy with default parameters (max_depth=1) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_default_params())
|
|
```
|
|
|
|
### 4.2. Example: `BestFirstCrawlingStrategy` - Setting `max_depth` to limit crawl depth.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_max_depth():
|
|
strategy = BestFirstCrawlingStrategy(max_depth=2)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BestFirstCrawlingStrategy with max_depth=2 ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in sorted(results, key=lambda r: (r.metadata.get('depth', 0), r.url)):
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
assert all(r.metadata.get('depth', 0) <= 2 for r in results if r.success)
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_max_depth())
|
|
```
|
|
|
|
### 4.3. Example: `BestFirstCrawlingStrategy` - Setting `max_pages` to limit total pages crawled.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy
|
|
from unittest.mock import patch
|
|
import math
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_max_pages():
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=math.inf,
|
|
max_pages=3
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BestFirstCrawlingStrategy with max_pages=3 ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
assert len(results) <= 3
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_max_pages())
|
|
```
|
|
|
|
### 4.4. Example: `BestFirstCrawlingStrategy` - Using `include_external=True`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_include_external():
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=1,
|
|
include_external=True,
|
|
max_pages=5 # To keep it manageable
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BestFirstCrawlingStrategy with include_external=True (max_depth=1) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
found_external = False
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
if "external-site.com" in result.url:
|
|
found_external = True
|
|
|
|
assert found_external, "Expected to crawl an external link."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_include_external())
|
|
```
|
|
|
|
### 4.5. Example: `BestFirstCrawlingStrategy` - Using `KeywordRelevanceScorer` to prioritize URLs containing specific keywords.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_keyword_scorer():
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "advanced", "core"])
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=1,
|
|
url_scorer=scorer,
|
|
max_pages=4 # Limit for example clarity
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS,
|
|
stream=True # Stream to see order
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
print(f"--- BestFirstCrawlingStrategy with KeywordRelevanceScorer ---")
|
|
results_list = []
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
results_list.append(result)
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f} (Depth: {result.metadata.get('depth')})")
|
|
|
|
# Check if pages with keywords like "feature" or "core" were prioritized (appeared earlier/higher score)
|
|
# This is a soft check as actual order depends on many factors in a real crawl
|
|
# and the mock site's link structure.
|
|
print("\nNote: Higher scores should ideally correspond to URLs with keywords 'feature', 'advanced', 'core'.")
|
|
feature_page_crawled = any("page2.html" in r.url for r in results_list) # page2 has "feature"
|
|
assert feature_page_crawled, "Page with 'feature' keyword was expected."
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_keyword_scorer())
|
|
```
|
|
|
|
### 4.6. Example: `BestFirstCrawlingStrategy` - Using `PathDepthScorer` to influence priority based on URL path depth.
|
|
This scorer penalizes deeper paths by default.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, PathDepthScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_path_depth_scorer():
|
|
# Penalizes deeper paths (lower score for deeper paths)
|
|
scorer = PathDepthScorer(higher_score_is_better=False)
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=2, # Allow some depth to see scorer effect
|
|
url_scorer=scorer
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS,
|
|
stream=True
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
print(f"--- BestFirstCrawlingStrategy with PathDepthScorer (favoring shallower paths) ---")
|
|
|
|
results_list = []
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
results_list.append(result)
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")
|
|
|
|
# A simple check: depth 1 pages should generally have higher (less negative) scores than depth 2
|
|
# (if scores are negative due to penalty) or simply appear earlier if scores are positive.
|
|
# With default scoring, higher score_is_better = True, so higher depth = lower score.
|
|
# With higher_score_is_better=False, higher depth = higher (less negative) score.
|
|
# The mock PathDepthScorer will need to be implemented or this test adjusted based on actual scorer logic.
|
|
# For now, let's assume the scorer penalizes, so deeper paths have lower (more negative) scores.
|
|
print("\nNote: Shallower pages should ideally have higher scores.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_path_depth_scorer())
|
|
```
|
|
|
|
### 4.7. Example: `BestFirstCrawlingStrategy` - Using `ContentTypeScorer` to prioritize HTML pages over PDFs.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, ContentTypeScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_content_type_scorer():
|
|
# Prioritize HTML, penalize PDF
|
|
scorer = ContentTypeScorer(content_type_weights={"text/html": 1.0, "application/pdf": -0.5})
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=1,
|
|
url_scorer=scorer
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS,
|
|
stream=True
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html" # This page links to HTML and PDF
|
|
print(f"--- BestFirstCrawlingStrategy with ContentTypeScorer (HTML > PDF) ---")
|
|
|
|
results_list = []
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
results_list.append(result)
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Content-Type: {result.response_headers.get('Content-Type')}")
|
|
|
|
html_page_score = next((r.metadata.get('score') for r in results_list if "page1_sub1.html" in r.url), None)
|
|
pdf_page_score = next((r.metadata.get('score') for r in results_list if "page1_sub2.pdf" in r.url), None)
|
|
|
|
print(f"HTML page score: {html_page_score}, PDF page score: {pdf_page_score}")
|
|
if html_page_score is not None and pdf_page_score is not None:
|
|
assert html_page_score > pdf_page_score, "HTML page should have a higher score than PDF."
|
|
elif html_page_score is None or pdf_page_score is None:
|
|
print("Warning: Could not find both HTML and PDF pages in results to compare scores.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_content_type_scorer())
|
|
```
|
|
|
|
### 4.8. Example: `BestFirstCrawlingStrategy` - Using `CompositeScorer` to combine `KeywordRelevanceScorer` and `PathDepthScorer`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer, PathDepthScorer, CompositeScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_composite_scorer():
|
|
keyword_scorer = KeywordRelevanceScorer(keywords=["feature", "core"], weight=0.7)
|
|
path_scorer = PathDepthScorer(weight=0.3, higher_score_is_better=False) # Penalize depth slightly
|
|
|
|
composite_scorer = CompositeScorer(scorers=[keyword_scorer, path_scorer])
|
|
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=2,
|
|
url_scorer=composite_scorer,
|
|
max_pages=6
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS,
|
|
stream=True
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
print(f"--- BestFirstCrawlingStrategy with CompositeScorer ---")
|
|
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")
|
|
print("\nNote: Scores are a combination of keyword relevance and path depth penalty.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_composite_scorer())
|
|
```
|
|
|
|
### 4.9. Example: `BestFirstCrawlingStrategy` - Integrating a `FilterChain` with `ContentTypeFilter` to only process HTML.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, FilterChain, ContentTypeFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_with_content_type_filter():
|
|
content_filter = ContentTypeFilter(allowed_types=["text/html"])
|
|
filter_chain = FilterChain(filters=[content_filter])
|
|
|
|
# Scorer is optional here, just demonstrating filter integration
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=1,
|
|
filter_chain=filter_chain
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html" # This page links to HTML and PDF
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BestFirstCrawlingStrategy with ContentTypeFilter (HTML only) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
all_html = True
|
|
for result in results:
|
|
content_type = result.response_headers.get('Content-Type', '')
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}, Content-Type: {content_type}")
|
|
if result.metadata.get('depth',0) > 0 and "text/html" not in content_type : # Start URL is not filtered
|
|
all_html = False
|
|
|
|
discovered_pages = [r for r in results if r.metadata.get('depth',0) > 0]
|
|
if discovered_pages:
|
|
assert all("text/html" in r.response_headers.get('Content-Type','') for r in discovered_pages), "Non-HTML page found among discovered pages."
|
|
print("Filter for HTML content type applied successfully to discovered pages.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_with_content_type_filter())
|
|
```
|
|
|
|
### 4.10. Example: `BestFirstCrawlingStrategy` - Streaming results and observing the order based on scores.
|
|
This example will use a scorer and stream results to demonstrate that higher-scored URLs are (generally) processed earlier.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_streaming_order():
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "advanced"])
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=1,
|
|
url_scorer=scorer,
|
|
max_pages=5
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=True,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
print(f"--- BestFirstCrawlingStrategy - Streaming and Observing Order ---")
|
|
|
|
previous_score = float('inf') # Assuming scores are positive and higher is better
|
|
processed_urls = []
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
current_score = result.metadata.get('score', 0.0)
|
|
print(f" Streamed: {result.url}, Score: {current_score:.2f}, Depth: {result.metadata.get('depth')}")
|
|
# Note: Due to batching (BATCH_SIZE) and async nature, strict descending order isn't guaranteed
|
|
# but generally higher scored items should appear earlier.
|
|
# assert current_score <= previous_score + 1e-9, f"Scores not in generally descending order: {previous_score} then {current_score}"
|
|
# previous_score = current_score
|
|
processed_urls.append((result.url, current_score))
|
|
|
|
print("\nProcessed URLs and their scores (order of processing):")
|
|
for url, score in processed_urls:
|
|
print(f" {url} (Score: {score:.2f})")
|
|
print("Note: Higher scored URLs are prioritized but strict order depends on batching and concurrency.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_streaming_order())
|
|
```
|
|
|
|
### 4.11. Example: `BestFirstCrawlingStrategy` - Batch results and analyzing scores post-crawl.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_batch_analysis():
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "core"])
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=1,
|
|
url_scorer=scorer,
|
|
max_pages=5
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=False, # Batch mode
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BestFirstCrawlingStrategy - Batch Results Analysis ---")
|
|
print(f"Received {len(results)} pages.")
|
|
|
|
# Sort by score for analysis (higher score first)
|
|
sorted_results = sorted(results, key=lambda r: r.metadata.get('score', 0.0), reverse=True)
|
|
|
|
for result in sorted_results:
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_batch_analysis())
|
|
```
|
|
|
|
### 4.12. Example: `BestFirstCrawlingStrategy` - Accessing and interpreting `score`, `depth`, and `parent_url` from `CrawlResult.metadata`.
|
|
This explicitly shows how to get these specific metadata fields.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_access_metadata():
|
|
scorer = KeywordRelevanceScorer(keywords=["feature"])
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- BestFirstCrawlingStrategy - Accessing Metadata ---")
|
|
for result in results:
|
|
if result.success:
|
|
url = result.url
|
|
metadata = result.metadata
|
|
depth = metadata.get('depth', 'N/A')
|
|
parent_url = metadata.get('parent_url', 'N/A')
|
|
score = metadata.get('score', 'N/A')
|
|
|
|
print(f"URL: {url}")
|
|
print(f" Depth: {depth}")
|
|
print(f" Parent URL: {parent_url}")
|
|
print(f" Score: {score:.2f}" if isinstance(score, float) else f" Score: {score}")
|
|
print("-" * 10)
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_access_metadata())
|
|
```
|
|
|
|
### 4.13. Example: `BestFirstCrawlingStrategy` - Demonstrating `shutdown()` to stop an ongoing prioritized crawl.
|
|
|
|
```python
|
|
import asyncio
|
|
import time
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_demonstrate_shutdown():
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "core", "example"])
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=5, # A potentially long crawl
|
|
max_pages=100,
|
|
url_scorer=scorer
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=True,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
print(f"--- BestFirstCrawlingStrategy with shutdown() demonstration ---")
|
|
|
|
crawl_task = asyncio.create_task(crawler.arun(url=start_url, config=run_config))
|
|
|
|
await asyncio.sleep(0.1)
|
|
|
|
print("Attempting to shut down the BestFirst crawl...")
|
|
await strategy.shutdown()
|
|
|
|
results_list = []
|
|
try:
|
|
async for res in await crawl_task:
|
|
results_list.append(res)
|
|
print(f" Collected result (post-shutdown signal): {res.url} (Score: {res.metadata.get('score', 0.0):.2f})")
|
|
except asyncio.CancelledError:
|
|
print("Crawl task was cancelled.")
|
|
|
|
print(f"Crawl shut down. Processed {len(results_list)} pages before/during shutdown.")
|
|
assert len(results_list) < 10, "Crawl likely didn't shut down early enough or mock site too small."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_demonstrate_shutdown())
|
|
```
|
|
|
|
### 4.14. Example: `BestFirstCrawlingStrategy` - Explaining the effect of `BATCH_SIZE` on `arun_many`.
|
|
`BATCH_SIZE` is an internal constant in `bbf_strategy.py` (typically 10). This example explains its role rather than making it directly configurable by the user through the strategy's constructor, as it's an internal implementation detail of how the strategy uses `crawler.arun_many`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
# Note: BATCH_SIZE is internal to BestFirstCrawlingStrategy, usually 10.
|
|
# We can't directly set it, but we can explain its effect.
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def best_first_batch_size_effect():
|
|
print("--- Explaining BATCH_SIZE in BestFirstCrawlingStrategy ---")
|
|
print("BestFirstCrawlingStrategy processes URLs in batches for efficiency.")
|
|
print("Internally, it retrieves a batch of highest-priority URLs (typically up to BATCH_SIZE, e.g., 10) from its queue.")
|
|
print("It then calls `crawler.arun_many()` with this batch.")
|
|
print("This means that while URLs are prioritized, the order within a small batch might not be strictly descending by score,")
|
|
print("especially if `stream=True`, as results from `arun_many` can arrive slightly out of strict submission order.")
|
|
print("The overall crawl still heavily favors higher-scored URLs first over many batches.")
|
|
|
|
# To simulate observing this, let's run a crawl and see if groups of results are processed.
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "core", "page1", "page2"])
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=2,
|
|
url_scorer=scorer,
|
|
max_pages=6 # Small enough to potentially see batching effects if BATCH_SIZE was smaller
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=True,
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
print("\n--- Crawl Example (max_pages=6) ---")
|
|
results_in_order = []
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
results_in_order.append(result.metadata.get('score',0.0))
|
|
print(f" Streamed: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
|
|
# This assertion is hard to make definitively without knowing the exact internal BATCH_SIZE
|
|
# and perfect mock site behavior. The print statements are more illustrative.
|
|
print("\nScores in order of processing:", [f"{s:.2f}" for s in results_in_order])
|
|
print("Observe if there are small groups where order might not be strictly descending due to batch processing.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(best_first_batch_size_effect())
|
|
```
|
|
|
|
---
|
|
## 5. Configuring Filters (`FilterChain`) for Deep Crawling
|
|
|
|
Filters allow you to control which URLs are processed during a deep crawl. They are applied *before* a URL is added to the crawl queue (except for the start URL).
|
|
|
|
### 5.1. `URLPatternFilter`
|
|
|
|
#### 5.1.1. Example: Using `URLPatternFilter` to allow URLs matching specific patterns (e.g., `/blog/*`).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_allow_pattern():
|
|
# Allow only URLs containing '/blog/'
|
|
url_filter = URLPatternFilter(patterns=["*/blog/*"])
|
|
filter_chain = FilterChain(filters=[url_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- URLPatternFilter: Allowing '*/blog/*' ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
print(f" URL: {r.url} (Depth: {r.metadata.get('depth')})")
|
|
if r.metadata.get('depth', 0) > 0: # Check discovered URLs
|
|
assert "/blog/" in r.url, f"Page {r.url} does not match pattern."
|
|
print("All discovered pages match the allowed pattern.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_allow_pattern())
|
|
```
|
|
|
|
#### 5.1.2. Example: Using `URLPatternFilter` to block URLs matching specific patterns (e.g., `*/login/*`, `*/archive/*`).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_block_pattern():
|
|
# Block URLs containing '/login/' or '/archive/'
|
|
url_filter = URLPatternFilter(patterns=["*/login/*", "*/archive/*"], block_list=True)
|
|
filter_chain = FilterChain(filters=[url_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- URLPatternFilter: Blocking '*/login/*' and '*/archive/*' ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
print(f" URL: {r.url} (Depth: {r.metadata.get('depth')})")
|
|
assert "/login/" not in r.url, f"Page {r.url} should have been blocked (login)."
|
|
assert "/archive/" not in r.url, f"Page {r.url} should have been blocked (archive)."
|
|
print("No pages matching blocked patterns were crawled.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_block_pattern())
|
|
```
|
|
|
|
#### 5.1.3. Example: `URLPatternFilter` with `case_sensitive=True` vs. `case_sensitive=False`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter
|
|
from unittest.mock import patch
|
|
|
|
# Add a case-specific URL to MOCK_SITE_DATA
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/Page1.html"] = {
|
|
"html_content": "<html><head><title>Page 1 Case Test</title></head><body><p>Content for case test.</p></body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
}
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] += '<a href="Page1.html">Page 1 Case Test</a>'
|
|
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_pattern_case_sensitivity():
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
# Case-sensitive: should only match 'page1.html'
|
|
print("\n--- URLPatternFilter: Case Sensitive (Allow '*/page1.html*') ---")
|
|
url_filter_sensitive = URLPatternFilter(patterns=["*/page1.html*"], case_sensitive=True)
|
|
filter_chain_sensitive = FilterChain(filters=[url_filter_sensitive])
|
|
strategy_sensitive = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain_sensitive)
|
|
run_config_sensitive = CrawlerRunConfig(deep_crawl_strategy=strategy_sensitive, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
results_sensitive = await crawler.arun(url=start_url, config=run_config_sensitive)
|
|
print(f"Crawled {len(results_sensitive)} pages.")
|
|
for r in results_sensitive:
|
|
print(f" URL: {r.url}")
|
|
if r.metadata.get('depth',0) > 0:
|
|
assert "page1.html" in r.url and "Page1.html" not in r.url, "Case-sensitive filter failed."
|
|
|
|
# Case-insensitive: should match both 'page1.html' and 'Page1.html'
|
|
print("\n--- URLPatternFilter: Case Insensitive (Allow '*/page1.html*') ---")
|
|
url_filter_insensitive = URLPatternFilter(patterns=["*/page1.html*"], case_sensitive=False)
|
|
filter_chain_insensitive = FilterChain(filters=[url_filter_insensitive])
|
|
strategy_insensitive = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain_insensitive)
|
|
run_config_insensitive = CrawlerRunConfig(deep_crawl_strategy=strategy_insensitive, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
results_insensitive = await crawler.arun(url=start_url, config=run_config_insensitive)
|
|
print(f"Crawled {len(results_insensitive)} pages.")
|
|
found_page1_lower = False
|
|
found_page1_upper = False
|
|
for r in results_insensitive:
|
|
print(f" URL: {r.url}")
|
|
if "page1.html" in r.url.lower(): # Check lower to catch both
|
|
if "page1.html" == Path(r.url).name: found_page1_lower = True
|
|
if "Page1.html" == Path(r.url).name: found_page1_upper = True
|
|
|
|
assert found_page1_lower and found_page1_upper, "Case-insensitive filter should have matched both cases."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_pattern_case_sensitivity())
|
|
```
|
|
|
|
### 5.2. `DomainFilter`
|
|
|
|
#### 5.2.1. Example: Using `DomainFilter` with `allowed_domains` to restrict crawling to a list of specific domains.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, DomainFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_allowed_domains():
|
|
# Only crawl within 'docs.crawl4ai.com'
|
|
domain_filter = DomainFilter(allowed_domains=["docs.crawl4ai.com"])
|
|
filter_chain = FilterChain(filters=[domain_filter])
|
|
|
|
# include_external needs to be True for DomainFilter to even consider other domains for blocking/allowing
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html" # This links to external-site.com
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DomainFilter: Allowing only 'docs.crawl4ai.com' ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
print(f" URL: {r.url}")
|
|
assert "docs.crawl4ai.com" in r.url, f"Page {r.url} is not from an allowed domain."
|
|
print("All crawled pages are from 'docs.crawl4ai.com'.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_allowed_domains())
|
|
```
|
|
|
|
#### 5.2.2. Example: Using `DomainFilter` with `blocked_domains` to avoid crawling certain domains.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, DomainFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_blocked_domains():
|
|
# Block 'external-site.com'
|
|
domain_filter = DomainFilter(blocked_domains=["external-site.com"])
|
|
filter_chain = FilterChain(filters=[domain_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DomainFilter: Blocking 'external-site.com' ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
print(f" URL: {r.url}")
|
|
assert "external-site.com" not in r.url, f"Page {r.url} from blocked domain was crawled."
|
|
print("No pages from 'external-site.com' were crawled.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_blocked_domains())
|
|
```
|
|
|
|
#### 5.2.3. Example: `DomainFilter` configured to allow subdomains (`allow_subdomains=True`).
|
|
(Conceptual as MOCK_SITE_DATA doesn't have subdomains for `docs.crawl4ai.com`.)
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, DomainFilter
|
|
from unittest.mock import patch
|
|
|
|
# Imagine MOCK_SITE_DATA also has:
|
|
# "https://blog.docs.crawl4ai.com/vibe-examples/post.html": { ... }
|
|
# And index.html links to it.
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_allow_subdomains():
|
|
domain_filter = DomainFilter(allowed_domains=["docs.crawl4ai.com"], allow_subdomains=True)
|
|
filter_chain = FilterChain(filters=[domain_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DomainFilter: Allowing subdomains of 'docs.crawl4ai.com' (Conceptual) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
print(f" URL: {r.url}")
|
|
# In a real test, you'd check if blog.docs.crawl4ai.com was included
|
|
print("This example is conceptual; for a real test, ensure mock data includes subdomains.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_allow_subdomains())
|
|
```
|
|
|
|
#### 5.2.4. Example: `DomainFilter` configured to disallow subdomains (`allow_subdomains=False`).
|
|
(Conceptual as MOCK_SITE_DATA doesn't have subdomains for `docs.crawl4ai.com`.)
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, DomainFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_disallow_subdomains():
|
|
domain_filter = DomainFilter(allowed_domains=["docs.crawl4ai.com"], allow_subdomains=False) # Default
|
|
filter_chain = FilterChain(filters=[domain_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain, include_external=True)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- DomainFilter: Disallowing subdomains of 'docs.crawl4ai.com' (Conceptual) ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
print(f" URL: {r.url}")
|
|
# In a real test, you'd check if blog.docs.crawl4ai.com was NOT included
|
|
print("This example is conceptual; for a real test, ensure mock data includes subdomains to be excluded.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_disallow_subdomains())
|
|
```
|
|
|
|
### 5.3. `ContentTypeFilter`
|
|
|
|
#### 5.3.1. Example: Using `ContentTypeFilter` to allow only `text/html` pages.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, ContentTypeFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_allow_html_only():
|
|
content_filter = ContentTypeFilter(allowed_types=["text/html"])
|
|
filter_chain = FilterChain(filters=[content_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html" # Links to HTML and PDF
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- ContentTypeFilter: Allowing only 'text/html' ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
content_type = r.response_headers.get('Content-Type', '')
|
|
print(f" URL: {r.url}, Content-Type: {content_type}")
|
|
if r.metadata.get('depth', 0) > 0: # Check discovered URLs
|
|
assert "text/html" in content_type, f"Page {r.url} has wrong content type: {content_type}"
|
|
print("All discovered pages are 'text/html'.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_allow_html_only())
|
|
```
|
|
|
|
#### 5.3.2. Example: Using `ContentTypeFilter` with multiple `allowed_types` (e.g., `text/html`, `application/json`).
|
|
(Conceptual, as MOCK_SITE_DATA only has html/pdf)
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, ContentTypeFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_allow_multiple_types():
|
|
content_filter = ContentTypeFilter(allowed_types=["text/html", "application/json"])
|
|
filter_chain = FilterChain(filters=[content_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"
|
|
# Imagine page1.html also links to a page1_sub3.json
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1_sub3.json"] = {
|
|
"html_content": '{"key": "value"}',
|
|
"response_headers": {"Content-Type": "application/json"}
|
|
}
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1.html"]["html_content"] += '<a href="page1_sub3.json">JSON Data</a>'
|
|
|
|
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- ContentTypeFilter: Allowing 'text/html', 'application/json' ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
found_json = False
|
|
for r in results:
|
|
content_type = r.response_headers.get('Content-Type', '')
|
|
print(f" URL: {r.url}, Content-Type: {content_type}")
|
|
if r.metadata.get('depth',0) > 0:
|
|
assert "text/html" in content_type or "application/json" in content_type
|
|
if "application/json" in content_type:
|
|
found_json = True
|
|
assert found_json, "Expected to find a JSON page."
|
|
print("All discovered pages are either 'text/html' or 'application/json'.")
|
|
|
|
# Clean up mock data
|
|
del MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1_sub3.json"]
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1.html"]["html_content"] = MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1.html"]["html_content"].replace('<a href="page1_sub3.json">JSON Data</a>', '')
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_allow_multiple_types())
|
|
```
|
|
|
|
#### 5.3.3. Example: Using `ContentTypeFilter` with `blocked_types` (e.g., blocking `application/pdf`).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, ContentTypeFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_block_pdf():
|
|
content_filter = ContentTypeFilter(blocked_types=["application/pdf"])
|
|
filter_chain = FilterChain(filters=[content_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html" # Links to HTML and PDF
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- ContentTypeFilter: Blocking 'application/pdf' ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
content_type = r.response_headers.get('Content-Type', '')
|
|
print(f" URL: {r.url}, Content-Type: {content_type}")
|
|
assert "application/pdf" not in content_type, f"PDF page {r.url} was not blocked."
|
|
print("No 'application/pdf' pages were crawled (beyond start URL if it was PDF).")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_block_pdf())
|
|
```
|
|
|
|
### 5.4. `URLFilter` (Simple exact match)
|
|
|
|
#### 5.4.1. Example: `URLFilter` to allow a specific list of exact URLs.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_allow_exact_urls():
|
|
allowed_urls = [
|
|
"https://docs.crawl4ai.com/vibe-examples/page1.html",
|
|
"https://docs.crawl4ai.com/vibe-examples/page1_sub1.html"
|
|
]
|
|
url_filter = URLFilter(urls=allowed_urls, block_list=False) # Allow list
|
|
filter_chain = FilterChain(filters=[url_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=2, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- URLFilter: Allowing specific URLs ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
crawled_urls = {r.url for r in results}
|
|
# The start URL is always crawled initially, then its links are filtered.
|
|
# So we check that all *other* crawled URLs are in the allowed list.
|
|
for r_url in crawled_urls:
|
|
if r_url != start_url: # Exclude start_url from this assertion
|
|
assert r_url in allowed_urls, f"URL {r_url} was not in the allowed list."
|
|
print("Only URLs from the allowed list (plus start_url) were crawled.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_allow_exact_urls())
|
|
```
|
|
|
|
#### 5.4.2. Example: `URLFilter` to block a specific list of exact URLs.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLFilter
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_block_exact_urls():
|
|
blocked_urls = [
|
|
"https://docs.crawl4ai.com/vibe-examples/page2.html",
|
|
"https://docs.crawl4ai.com/vibe-examples/archive/old_page.html"
|
|
]
|
|
url_filter = URLFilter(urls=blocked_urls, block_list=True) # Block list
|
|
filter_chain = FilterChain(filters=[url_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- URLFilter: Blocking specific URLs ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
crawled_urls = {r.url for r in results}
|
|
for blocked_url in blocked_urls:
|
|
assert blocked_url not in crawled_urls, f"URL {blocked_url} should have been blocked."
|
|
print("Blocked URLs were not crawled.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_block_exact_urls())
|
|
```
|
|
|
|
### 5.5. `ContentRelevanceFilter`
|
|
This filter uses an LLM to determine relevance. The example focuses on setup, as a full run requires an LLM.
|
|
|
|
#### 5.5.1. Example: Setting up `ContentRelevanceFilter` with target keywords (conceptual, focusing on setup).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, ContentRelevanceFilter, LLMConfig
|
|
|
|
# This is a conceptual example showing setup.
|
|
# A real run would require an LLM provider to be configured.
|
|
async def setup_content_relevance_filter():
|
|
print("--- Setting up ContentRelevanceFilter (Conceptual) ---")
|
|
|
|
# Define keywords and context for relevance
|
|
keywords = ["artificial intelligence", "web crawling", "data extraction"]
|
|
context_query = "Articles related to AI-powered web scraping tools and techniques."
|
|
|
|
# Configure LLM (replace with your actual provider and API key)
|
|
llm_config = LLMConfig(provider="openai/gpt-3.5-turbo", api_token="YOUR_OPENAI_API_KEY")
|
|
|
|
relevance_filter = ContentRelevanceFilter(
|
|
llm_config=llm_config,
|
|
keywords=keywords,
|
|
context_query=context_query,
|
|
threshold=0.6 # Adjust threshold as needed
|
|
)
|
|
filter_chain = FilterChain(filters=[relevance_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
print("ContentRelevanceFilter configured. To run this example:")
|
|
print("1. Replace 'YOUR_OPENAI_API_KEY' with your actual OpenAI API key.")
|
|
print("2. (Optional) Install OpenAI client: pip install openai")
|
|
print("3. Uncomment the crawler execution part below.")
|
|
|
|
# # Example of how it would be used (requires actual LLM call)
|
|
# async with AsyncWebCrawler() as crawler:
|
|
# # Mock or use a real URL that would trigger the LLM
|
|
# start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html"
|
|
# print(f"Attempting to crawl {start_url} with ContentRelevanceFilter...")
|
|
# # results = await crawler.arun(url=start_url, config=run_config)
|
|
# # print(f"Crawled {len(results)} pages after relevance filtering.")
|
|
# # for r in results:
|
|
# # print(f" URL: {r.url}, Relevance Score: {r.metadata.get('relevance_score')}")
|
|
print("Conceptual setup complete.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(setup_content_relevance_filter())
|
|
```
|
|
|
|
#### 5.5.2. Example: `ContentRelevanceFilter` with a custom `threshold`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import ContentRelevanceFilter, LLMConfig
|
|
|
|
async def content_relevance_custom_threshold():
|
|
print("--- ContentRelevanceFilter with custom threshold (Conceptual Setup) ---")
|
|
llm_config = LLMConfig(provider="openai/gpt-3.5-turbo", api_token="YOUR_OPENAI_API_KEY") # Replace
|
|
|
|
# A higher threshold means stricter relevance checking
|
|
strict_filter = ContentRelevanceFilter(
|
|
llm_config=llm_config,
|
|
keywords=["specific technical term"],
|
|
threshold=0.8
|
|
)
|
|
print(f"Strict filter created with threshold: {strict_filter.threshold}")
|
|
|
|
# A lower threshold is more lenient
|
|
lenient_filter = ContentRelevanceFilter(
|
|
llm_config=llm_config,
|
|
keywords=["general topic"],
|
|
threshold=0.4
|
|
)
|
|
print(f"Lenient filter created with threshold: {lenient_filter.threshold}")
|
|
print("Note: Actual filtering behavior depends on LLM responses to content.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(content_relevance_custom_threshold())
|
|
```
|
|
|
|
### 5.6. `SEOFilter`
|
|
This filter checks for common SEO issues. The example is conceptual, focusing on setup.
|
|
|
|
#### 5.6.1. Example: Basic `SEOFilter` with default SEO checks (conceptual, focusing on setup).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import SEOFilter
|
|
|
|
async def setup_basic_seo_filter():
|
|
print("--- Basic SEOFilter with default checks (Conceptual Setup) ---")
|
|
|
|
# Default checks might include missing title, short meta description, etc.
|
|
seo_filter = SEOFilter()
|
|
|
|
print(f"SEOFilter created with default settings:")
|
|
print(f" Min Title Length: {seo_filter.min_title_length}")
|
|
print(f" Max Title Length: {seo_filter.max_title_length}")
|
|
print(f" Min Meta Description Length: {seo_filter.min_meta_description_length}")
|
|
# ... and other default parameters
|
|
print("This filter would be added to a FilterChain and used in a DeepCrawlStrategy.")
|
|
print("It would then check each page against these SEO criteria.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(setup_basic_seo_filter())
|
|
```
|
|
|
|
#### 5.6.2. Example: `SEOFilter` configuring specific checks like `min_title_length`, `max_meta_description_length`, or `keyword_in_title_check` (conceptual).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import SEOFilter
|
|
|
|
async def setup_custom_seo_filter():
|
|
print("--- SEOFilter with custom checks (Conceptual Setup) ---")
|
|
|
|
custom_seo_filter = SEOFilter(
|
|
min_title_length=20,
|
|
max_meta_description_length=150,
|
|
keyword_in_title_check=True,
|
|
target_keywords_for_seo=["crawl4ai", "web scraping"] # if keyword_in_title_check is True
|
|
)
|
|
|
|
print(f"Custom SEOFilter created with:")
|
|
print(f" Min Title Length: {custom_seo_filter.min_title_length}")
|
|
print(f" Max Meta Description Length: {custom_seo_filter.max_meta_description_length}")
|
|
print(f" Keyword in Title Check: {custom_seo_filter.keyword_in_title_check}")
|
|
print(f" Target SEO Keywords: {custom_seo_filter.target_keywords_for_seo}")
|
|
print("This filter would apply these specific criteria during a crawl.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(setup_custom_seo_filter())
|
|
```
|
|
|
|
### 5.7. `FilterChain`
|
|
|
|
#### 5.7.1. Example: Combining `URLPatternFilter` (allow `/products/*`) and `DomainFilter` (only `example.com`) in a `FilterChain`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter, DomainFilter
|
|
from unittest.mock import patch
|
|
|
|
# Add mock data for this scenario
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/products/productA.html"] = {
|
|
"html_content": "<html><title>Product A</title><body>Product A details</body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
}
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] += '<a href="products/productA.html">Product A</a>'
|
|
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_chain_combination():
|
|
product_filter = URLPatternFilter(patterns=["*/products/*"])
|
|
domain_filter = DomainFilter(allowed_domains=["docs.crawl4ai.com"])
|
|
|
|
combined_filter_chain = FilterChain(filters=[product_filter, domain_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=2, filter_chain=combined_filter_chain, include_external=True)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- FilterChain: URLPatternFilter + DomainFilter ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
for r in results:
|
|
print(f" URL: {r.url}")
|
|
if r.metadata.get('depth', 0) > 0: # Discovered URLs
|
|
assert "docs.crawl4ai.com" in r.url, "Domain filter failed."
|
|
assert "/products/" in r.url, "URL pattern filter failed."
|
|
print("All discovered pages are from 'docs.crawl4ai.com' and match '*/products/*'.")
|
|
|
|
# Clean up mock data
|
|
del MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/products/productA.html"]
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] = MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"].replace('<a href="products/productA.html">Product A</a>', '')
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_chain_combination())
|
|
```
|
|
|
|
#### 5.7.2. Example: Using `FilterChain` with `FilterStats` to retrieve and display statistics about filtered URLs.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter, FilterStats
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_chain_with_stats():
|
|
url_filter = URLPatternFilter(patterns=["*/blog/*"], block_list=False) # Allow only blog
|
|
filter_stats = FilterStats() # Create a stats object
|
|
filter_chain = FilterChain(filters=[url_filter], stats=filter_stats) # Pass stats to chain
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- FilterChain with FilterStats ---")
|
|
print(f"Crawled {len(results)} pages.")
|
|
|
|
print("\nFilter Statistics:")
|
|
print(f" Total URLs considered by filters: {filter_stats.total_considered}")
|
|
print(f" Total URLs allowed: {filter_stats.total_allowed}")
|
|
print(f" Total URLs blocked: {filter_stats.total_blocked}")
|
|
|
|
# Based on MOCK_SITE_DATA, index links to one /blog/ page and several non-blog pages.
|
|
# Start URL itself is not subject to filter_chain in this strategy logic.
|
|
# Links from start URL: page1, page2, external, archive, blog, login
|
|
# Only /blog/post1.html should pass. 5 should be blocked.
|
|
assert filter_stats.total_considered >= 5 # Links from index.html
|
|
assert filter_stats.total_allowed >= 1 # /blog/post1.html
|
|
assert filter_stats.total_blocked >= 4 # page1, page2, external (if not implicitly blocked), archive, login
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_chain_with_stats())
|
|
```
|
|
|
|
#### 5.7.3. Example: `FilterChain` with `allow_empty=True` vs `allow_empty=False`.
|
|
This shows how `allow_empty` on the `FilterChain` itself works. If `allow_empty=True` (default), an empty chain allows all URLs. If `False`, an empty chain blocks all.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def filter_chain_allow_empty():
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
# Case 1: allow_empty=True (default) - empty chain allows all
|
|
print("\n--- FilterChain with allow_empty=True (empty chain) ---")
|
|
empty_chain_allow = FilterChain(filters=[], allow_empty=True)
|
|
strategy_allow = BFSDeePCrawlStrategy(max_depth=1, filter_chain=empty_chain_allow)
|
|
run_config_allow = CrawlerRunConfig(deep_crawl_strategy=strategy_allow, cache_mode=CacheMode.BYPASS)
|
|
async with AsyncWebCrawler() as crawler:
|
|
results_allow = await crawler.arun(url=start_url, config=run_config_allow)
|
|
print(f"Crawled {len(results_allow)} pages. (Expected > 1 as all links from index should be allowed)")
|
|
assert len(results_allow) > 1 # Start URL + its links
|
|
|
|
# Case 2: allow_empty=False - empty chain blocks all (except start URL)
|
|
print("\n--- FilterChain with allow_empty=False (empty chain) ---")
|
|
empty_chain_block = FilterChain(filters=[], allow_empty=False)
|
|
strategy_block = BFSDeePCrawlStrategy(max_depth=1, filter_chain=empty_chain_block)
|
|
run_config_block = CrawlerRunConfig(deep_crawl_strategy=strategy_block, cache_mode=CacheMode.BYPASS)
|
|
async with AsyncWebCrawler() as crawler:
|
|
results_block = await crawler.arun(url=start_url, config=run_config_block)
|
|
print(f"Crawled {len(results_block)} pages. (Expected 1, only start URL)")
|
|
assert len(results_block) == 1 # Only start_url, as all its links are blocked by empty chain
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(filter_chain_allow_empty())
|
|
```
|
|
|
|
---
|
|
## 6. Configuring Scorers (`URLScorer`) for `BestFirstCrawlingStrategy`
|
|
|
|
Scorers are used by `BestFirstCrawlingStrategy` to prioritize URLs in its crawl queue.
|
|
|
|
### 6.1. `KeywordRelevanceScorer`
|
|
|
|
#### 6.1.1. Example: `KeywordRelevanceScorer` with a list of keywords and default weight.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_keyword_default_weight():
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "core concepts"]) # Default weight is 1.0
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer, max_pages=4)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- KeywordRelevanceScorer with default weight ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
async for result in await crawler.arun("https://docs.crawl4ai.com/vibe-examples/index.html", config=run_config):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
print("Pages containing 'feature' or 'core concepts' in their URL should have higher scores.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_keyword_default_weight())
|
|
```
|
|
|
|
#### 6.1.2. Example: `KeywordRelevanceScorer` adjusting the `weight` parameter to influence its importance.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer, PathDepthScorer, CompositeScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_keyword_custom_weight():
|
|
# High weight for keywords, low for path depth
|
|
keyword_scorer = KeywordRelevanceScorer(keywords=["feature"], weight=2.0)
|
|
path_scorer = PathDepthScorer(weight=0.1, higher_score_is_better=False) # Less penalty
|
|
|
|
composite_scorer = CompositeScorer(scorers=[keyword_scorer, path_scorer])
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=composite_scorer, max_pages=4)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- KeywordRelevanceScorer with adjusted weight (weight=2.0) in CompositeScorer ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
async for result in await crawler.arun("https://docs.crawl4ai.com/vibe-examples/index.html", config=run_config):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
print("Keyword relevance should have a stronger impact on the final score.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_keyword_custom_weight())
|
|
```
|
|
|
|
#### 6.1.3. Example: `KeywordRelevanceScorer` with `case_sensitive=True`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
# Modify mock data to have case-specific keywords in URLs
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/FEATUREpage.html"] = {
|
|
"html_content": "<html><title>FEATURE Page</title><body>Uppercase FEATURE</body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
}
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] += '<a href="FEATUREpage.html">FEATURE Page</a>'
|
|
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_keyword_case_sensitive():
|
|
# Case-sensitive: will only score URLs with 'feature' (lowercase)
|
|
scorer_sensitive = KeywordRelevanceScorer(keywords=["feature"], case_sensitive=True)
|
|
strategy_sensitive = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer_sensitive, max_pages=5)
|
|
run_config_sensitive = CrawlerRunConfig(deep_crawl_strategy=strategy_sensitive, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- KeywordRelevanceScorer with case_sensitive=True (keyword: 'feature') ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
async for result in await crawler.arun("https://docs.crawl4ai.com/vibe-examples/index.html", config=run_config_sensitive):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
if "FEATUREpage.html" in result.url: # Uppercase 'FEATURE'
|
|
assert result.metadata.get('score', 0.0) == 0.0, "Uppercase keyword should not be scored."
|
|
elif "page2.html" in result.url: # Contains lowercase 'feature' in title/mock
|
|
assert result.metadata.get('score', 0.0) > 0.0, "Lowercase keyword should be scored."
|
|
|
|
# Clean up mock data
|
|
del MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/FEATUREpage.html"]
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] = MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"].replace('<a href="FEATUREpage.html">FEATURE Page</a>', '')
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_keyword_case_sensitive())
|
|
```
|
|
|
|
### 6.2. `PathDepthScorer`
|
|
|
|
#### 6.2.1. Example: `PathDepthScorer` with default behavior (penalizing deeper paths).
|
|
By default, `PathDepthScorer` gives higher scores to shallower paths (depth 0 > depth 1 > depth 2).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, PathDepthScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_path_depth_default():
|
|
scorer = PathDepthScorer() # Default: higher_score_is_better=True, depth_penalty_factor=0.1
|
|
strategy = BestFirstCrawlingStrategy(max_depth=2, url_scorer=scorer, max_pages=6)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- PathDepthScorer with default behavior (shallower is better) ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
depth_scores = {}
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
depth = result.metadata.get('depth')
|
|
score = result.metadata.get('score', 0.0)
|
|
print(f" URL: {result.url}, Depth: {depth}, Score: {score:.2f}")
|
|
if depth not in depth_scores:
|
|
depth_scores[depth] = []
|
|
depth_scores[depth].append(score)
|
|
|
|
if 1 in depth_scores and 2 in depth_scores and depth_scores[1] and depth_scores[2]:
|
|
avg_score_depth1 = sum(depth_scores[1]) / len(depth_scores[1])
|
|
avg_score_depth2 = sum(depth_scores[2]) / len(depth_scores[2])
|
|
print(f"Avg score depth 1: {avg_score_depth1:.2f}, Avg score depth 2: {avg_score_depth2:.2f}")
|
|
assert avg_score_depth1 > avg_score_depth2, "Shallower paths should have higher scores."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_path_depth_default())
|
|
```
|
|
|
|
#### 6.2.2. Example: `PathDepthScorer` with custom `depth_penalty_factor`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, PathDepthScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_path_depth_custom_penalty():
|
|
# Higher penalty factor means deeper paths are penalized more severely
|
|
scorer = PathDepthScorer(depth_penalty_factor=0.5, higher_score_is_better=True)
|
|
strategy = BestFirstCrawlingStrategy(max_depth=2, url_scorer=scorer, max_pages=6)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- PathDepthScorer with custom depth_penalty_factor=0.5 ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
depth_scores = {}
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
depth = result.metadata.get('depth')
|
|
score = result.metadata.get('score', 0.0)
|
|
print(f" URL: {result.url}, Depth: {depth}, Score: {score:.2f}")
|
|
if depth not in depth_scores:
|
|
depth_scores[depth] = []
|
|
depth_scores[depth].append(score)
|
|
|
|
if 1 in depth_scores and 2 in depth_scores and depth_scores[1] and depth_scores[2]:
|
|
avg_score_depth1 = sum(depth_scores[1]) / len(depth_scores[1])
|
|
avg_score_depth2 = sum(depth_scores[2]) / len(depth_scores[2])
|
|
print(f"Avg score depth 1: {avg_score_depth1:.2f}, Avg score depth 2: {avg_score_depth2:.2f}")
|
|
# Expect a larger difference due to higher penalty
|
|
assert (avg_score_depth1 - avg_score_depth2) > 0.05, "Higher penalty factor should result in a larger score drop for deeper paths."
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_path_depth_custom_penalty())
|
|
```
|
|
|
|
#### 6.2.3. Example: `PathDepthScorer` with `higher_score_is_better=False` (to favor deeper paths).
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, PathDepthScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_path_depth_favor_deep():
|
|
# Now, deeper paths will get higher (less negative or more positive) scores
|
|
scorer = PathDepthScorer(higher_score_is_better=False)
|
|
strategy = BestFirstCrawlingStrategy(max_depth=2, url_scorer=scorer, max_pages=6)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- PathDepthScorer with higher_score_is_better=False (favoring deeper paths) ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
depth_scores = {}
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
depth = result.metadata.get('depth')
|
|
score = result.metadata.get('score', 0.0)
|
|
print(f" URL: {result.url}, Depth: {depth}, Score: {score:.2f}")
|
|
if depth not in depth_scores:
|
|
depth_scores[depth] = []
|
|
depth_scores[depth].append(score)
|
|
|
|
if 1 in depth_scores and 2 in depth_scores and depth_scores[1] and depth_scores[2]:
|
|
avg_score_depth1 = sum(depth_scores[1]) / len(depth_scores[1])
|
|
avg_score_depth2 = sum(depth_scores[2]) / len(depth_scores[2])
|
|
print(f"Avg score depth 1: {avg_score_depth1:.2f}, Avg score depth 2: {avg_score_depth2:.2f}")
|
|
assert avg_score_depth2 > avg_score_depth1, "Deeper paths should have higher scores with higher_score_is_better=False."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_path_depth_favor_deep())
|
|
```
|
|
|
|
### 6.3. `ContentTypeScorer`
|
|
|
|
#### 6.3.1. Example: `ContentTypeScorer` prioritizing `text/html` and penalizing `application/pdf`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, ContentTypeScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_content_type_html_vs_pdf():
|
|
scorer = ContentTypeScorer(
|
|
content_type_weights={"text/html": 1.0, "application/pdf": -1.0, "image/jpeg": 0.2}
|
|
)
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer, max_pages=5)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- ContentTypeScorer (HTML: 1.0, PDF: -1.0) ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html" # Links to HTML and PDF
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
content_type = result.response_headers.get('Content-Type', 'unknown')
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Type: {content_type}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_content_type_html_vs_pdf())
|
|
```
|
|
|
|
#### 6.3.2. Example: `ContentTypeScorer` with custom `content_type_weights`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, ContentTypeScorer
|
|
from unittest.mock import patch
|
|
|
|
# Add a JSON page to mock data
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/data.json"] = {
|
|
"html_content": '{"data": "sample"}', "response_headers": {"Content-Type": "application/json"}
|
|
}
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] += '<a href="data.json">JSON Data</a>'
|
|
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def scorer_content_type_custom_weights():
|
|
scorer = ContentTypeScorer(
|
|
content_type_weights={
|
|
"application/json": 2.0, # Highly prioritize JSON
|
|
"text/html": 0.5,
|
|
"application/pdf": -2.0 # Strongly penalize PDF
|
|
}
|
|
)
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=scorer, max_pages=5)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- ContentTypeScorer with custom weights (JSON: 2.0, HTML: 0.5, PDF: -2.0) ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/page1.html" # Links to HTML, PDF. Index links to JSON.
|
|
|
|
# We'll crawl index to ensure JSON is discoverable
|
|
async for result in await crawler.arun("https://docs.crawl4ai.com/vibe-examples/index.html", config=run_config):
|
|
if result.success:
|
|
content_type = result.response_headers.get('Content-Type', 'unknown')
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Type: {content_type}")
|
|
|
|
del MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/data.json"]
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] = MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"].replace('<a href="data.json">JSON Data</a>', '')
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(scorer_content_type_custom_weights())
|
|
```
|
|
|
|
### 6.4. `DomainAuthorityScorer`
|
|
|
|
#### 6.4.1. Example: Setting up `DomainAuthorityScorer` (conceptual, as DA often requires an external API or dataset).
|
|
This example shows how to instantiate and potentially use it, but actual scoring depends on external data.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import DomainAuthorityScorer
|
|
|
|
async def setup_domain_authority_scorer():
|
|
print("--- DomainAuthorityScorer (Conceptual Setup) ---")
|
|
|
|
# Conceptual: imagine you have a way to get DA scores
|
|
# da_scores = {"example.com": 90, "anotherexample.net": 70}
|
|
# scorer = DomainAuthorityScorer(domain_authority_map=da_scores, weight=1.5)
|
|
|
|
# For this example, we'll just instantiate it
|
|
scorer = DomainAuthorityScorer(weight=1.5)
|
|
print(f"DomainAuthorityScorer created with weight: {scorer.weight}")
|
|
print("To use this scorer effectively, you'd need a 'domain_authority_map' or a way to fetch DA scores.")
|
|
print("Example URL score (conceptual): ", scorer.score("https://highly-authoritative-site.com/page"))
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(setup_domain_authority_scorer())
|
|
```
|
|
|
|
### 6.5. `FreshnessScorer`
|
|
|
|
#### 6.5.1. Example: Setting up `FreshnessScorer` (conceptual, as freshness often requires parsing dates from content or headers).
|
|
This example focuses on instantiation. Actual scoring would need date extraction.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import FreshnessScorer
|
|
from datetime import datetime, timedelta
|
|
|
|
async def setup_freshness_scorer():
|
|
print("--- FreshnessScorer (Conceptual Setup) ---")
|
|
|
|
# Conceptual: the scorer would need a way to get the publication date of a URL
|
|
# For this example, we'll just instantiate it
|
|
scorer = FreshnessScorer(
|
|
max_age_days=30, # Pages older than 30 days get lower scores
|
|
date_penalty_factor=0.1 # How much to penalize per day older
|
|
)
|
|
print(f"FreshnessScorer created with max_age_days: {scorer.max_age_days}")
|
|
print("To use this, the crawling process or a pre-processor would need to extract and provide publication dates for URLs.")
|
|
|
|
# Conceptual scoring:
|
|
# recent_date = datetime.now() - timedelta(days=5)
|
|
# old_date = datetime.now() - timedelta(days=60)
|
|
# print(f"Score for recent page (mock date): {scorer.score('https://example.com/recent', publication_date=recent_date)}")
|
|
# print(f"Score for old page (mock date): {scorer.score('https://example.com/old', publication_date=old_date)}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(setup_freshness_scorer())
|
|
```
|
|
|
|
### 6.6. `CompositeScorer`
|
|
|
|
#### 6.6.1. Example: Combining `KeywordRelevanceScorer` and `PathDepthScorer` using `CompositeScorer` with equal weights.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy
|
|
from crawl4ai import KeywordRelevanceScorer, PathDepthScorer, CompositeScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def composite_scorer_equal_weights():
|
|
keyword_scorer = KeywordRelevanceScorer(keywords=["feature"]) # Default weight 1.0
|
|
path_scorer = PathDepthScorer(higher_score_is_better=False) # Default weight 1.0, penalizes depth
|
|
|
|
# Equal weighting by default if weights list not provided or all weights are same
|
|
composite_scorer = CompositeScorer(scorers=[keyword_scorer, path_scorer])
|
|
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=composite_scorer, max_pages=5)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- CompositeScorer with equal weights for Keyword and PathDepth ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")
|
|
print("Scores are an equal combination of keyword relevance and path depth penalty.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(composite_scorer_equal_weights())
|
|
```
|
|
|
|
#### 6.6.2. Example: `CompositeScorer` assigning different `weights` to prioritize one scorer over another.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy
|
|
from crawl4ai import KeywordRelevanceScorer, PathDepthScorer, CompositeScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def composite_scorer_different_weights():
|
|
# Keyword relevance is more important
|
|
keyword_scorer = KeywordRelevanceScorer(keywords=["feature"])
|
|
path_scorer = PathDepthScorer(higher_score_is_better=False)
|
|
|
|
composite_scorer = CompositeScorer(
|
|
scorers=[keyword_scorer, path_scorer],
|
|
weights=[0.8, 0.2] # Keyword scorer has 80% influence, PathDepth 20%
|
|
)
|
|
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=composite_scorer, max_pages=5)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- CompositeScorer with different weights (Keyword: 0.8, PathDepth: 0.2) ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}")
|
|
print("Keyword relevance should more heavily influence scores.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(composite_scorer_different_weights())
|
|
```
|
|
|
|
#### 6.6.3. Example: Nesting `CompositeScorer` for more complex scoring logic.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy
|
|
from crawl4ai import KeywordRelevanceScorer, PathDepthScorer, ContentTypeScorer, CompositeScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def composite_scorer_nesting():
|
|
keyword_scorer = KeywordRelevanceScorer(keywords=["feature"])
|
|
path_scorer = PathDepthScorer(higher_score_is_better=False)
|
|
content_type_scorer = ContentTypeScorer(content_type_weights={"text/html": 1.0, "application/pdf": -1.0})
|
|
|
|
# First level composite: keyword and path
|
|
relevance_and_structure_scorer = CompositeScorer(
|
|
scorers=[keyword_scorer, path_scorer],
|
|
weights=[0.7, 0.3]
|
|
)
|
|
|
|
# Second level composite: combine above with content type
|
|
final_scorer = CompositeScorer(
|
|
scorers=[relevance_and_structure_scorer, content_type_scorer],
|
|
weights=[0.8, 0.2] # Relevance/structure is 80%, content type 20%
|
|
)
|
|
|
|
strategy = BestFirstCrawlingStrategy(max_depth=1, url_scorer=final_scorer, max_pages=5)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- Nested CompositeScorer ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}, Depth: {result.metadata.get('depth')}, Type: {result.response_headers.get('Content-Type')}")
|
|
print("Scores reflect a nested combination of keyword, path, and content type.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(composite_scorer_nesting())
|
|
```
|
|
|
|
---
|
|
## 7. General Deep Crawl Configuration and Usage
|
|
|
|
### 7.1. Example: Deep crawling a site that relies heavily on JavaScript for link generation.
|
|
This example demonstrates the setup. A real JS-heavy site would be needed for full verification.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, BrowserConfig
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def deep_crawl_js_heavy_site():
|
|
# BrowserConfig enables JS by default.
|
|
# For very JS-heavy sites, ensure headless=False if debugging, and consider timeouts.
|
|
browser_cfg = BrowserConfig(headless=True) # Keep headless for automated tests
|
|
|
|
# CrawlerRunConfig might need adjustments for JS execution time
|
|
run_cfg = CrawlerRunConfig(
|
|
page_timeout=30000, # 30 seconds, might need more for complex JS
|
|
# js_code can be used to trigger actions if needed before link discovery
|
|
# js_code="window.scrollTo(0, document.body.scrollHeight);", # Example to scroll
|
|
deep_crawl_strategy=BFSDeePCrawlStrategy(max_depth=1, max_pages=3),
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
print("--- Deep Crawling a JS-Heavy Site (Conceptual: JS execution is enabled by default) ---")
|
|
# Using index.html which has a JS-triggered link via onclick
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
|
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
|
results = await crawler.arun(url=start_url, config=run_cfg)
|
|
|
|
print(f"Crawled {len(results)} pages.")
|
|
js_link_found = False
|
|
for result in results:
|
|
print(f" URL: {result.url}")
|
|
if "js_page.html" in result.url:
|
|
js_link_found = True
|
|
|
|
# This assertion relies on the MockAsyncWebCrawler's _fetch_page
|
|
# correctly parsing links from html_content, even if added by mock JS.
|
|
# A more robust test would involve Playwright's own JS execution.
|
|
# For now, we assume the mock crawler finds links from the final HTML state.
|
|
# To truly test JS-driven links, one would need to modify MockAsyncWebCrawler
|
|
# to simulate JS execution or use a real browser test.
|
|
# This example mainly shows the configuration for enabling JS.
|
|
print("Note: True JS-link discovery depends on Playwright's execution within the crawler.")
|
|
print("The mock crawler simulates link finding from final HTML state.")
|
|
# assert js_link_found, "JS-generated link was not found. Mock might need adjustment or real browser test."
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(deep_crawl_js_heavy_site())
|
|
```
|
|
|
|
### 7.2. Example: How `CrawlerRunConfig` parameters (e.g., `page_timeout`) and `BrowserConfig` (e.g., `user_agent`, `proxy_config`) affect underlying page fetches.
|
|
This shows how `BrowserConfig` (passed to `AsyncWebCrawler`) and `CrawlerRunConfig` (passed to `arun`) influence individual page fetches.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, BrowserConfig, ProxyConfig
|
|
from unittest.mock import patch
|
|
|
|
# Mocking a proxy server check - in reality, you'd use a real proxy
|
|
async def mock_check_ip_via_proxy(url, config):
|
|
# This function would normally make a request through the proxy
|
|
# and return the perceived IP. For mock, we'll just simulate.
|
|
if config and config.proxy_config and config.proxy_config.server == "http://mockproxy.com:8080":
|
|
return "1.2.3.4" # Mocked IP if proxy is used
|
|
return "9.8.7.6" # Mocked direct IP
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def deep_crawl_with_configs():
|
|
browser_cfg = BrowserConfig(
|
|
user_agent="MyCustomDeepCrawler/1.0",
|
|
proxy_config=ProxyConfig(server="http://mockproxy.com:8080") # This should be used by crawler
|
|
)
|
|
|
|
# For deep crawl, the page_timeout in CrawlerRunConfig applies to each page fetch
|
|
run_cfg = CrawlerRunConfig(
|
|
page_timeout=15000, # 15s timeout for each page in the deep crawl
|
|
deep_crawl_strategy=BFSDeePCrawlStrategy(max_depth=0), # Just the start URL
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
print("--- Deep Crawl with Custom Browser & Run Configs ---")
|
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
|
# The crawler instance now has the browser_cfg settings.
|
|
# We expect its internal page fetches to use these.
|
|
|
|
# We'd need to inspect logs or mock `crawler.strategy._fetch_page` to truly verify user_agent/proxy.
|
|
# For this example, we'll conceptually check based on setup.
|
|
print(f"Browser User-Agent set to: {crawler.browser_config.user_agent}")
|
|
if crawler.browser_config.proxy_config:
|
|
print(f"Browser Proxy set to: {crawler.browser_config.proxy_config.server}")
|
|
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_cfg)
|
|
|
|
if results and results[0].success:
|
|
print(f"Crawled {results[0].url} successfully with page_timeout={run_cfg.page_timeout}ms")
|
|
# In a real scenario with a proxy, you'd verify the source IP.
|
|
# For mock:
|
|
# perceived_ip = await mock_check_ip_via_proxy(start_url, browser_cfg)
|
|
# print(f"Perceived IP (mocked): {perceived_ip}")
|
|
# assert perceived_ip == "1.2.3.4" # Assuming proxy was used
|
|
else:
|
|
print(f"Crawl failed for {start_url}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(deep_crawl_with_configs())
|
|
```
|
|
|
|
### 7.3. Example: Iterating through deep crawl results and handling cases where some pages failed to crawl or were filtered out.
|
|
A robust deep crawl should handle partial failures gracefully.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain, URLPatternFilter
|
|
from unittest.mock import patch
|
|
|
|
# Add a URL that will "fail" in our mock
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/failing_page.html"] = {
|
|
"html_content": None, # Simulate failure by not providing content
|
|
"success": False,
|
|
"status_code": 500,
|
|
"error_message": "Mock Server Error"
|
|
}
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] += '<a href="failing_page.html">Failing Page</a>'
|
|
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def deep_crawl_handling_failures():
|
|
# Filter out '/archive/' pages, and one page will fail
|
|
url_filter = URLPatternFilter(patterns=["*/archive/*"], block_list=True)
|
|
filter_chain = FilterChain(filters=[url_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"--- Deep Crawl - Handling Failures and Filtered Pages ---")
|
|
successful_pages = 0
|
|
failed_pages = 0
|
|
|
|
for result in results:
|
|
if result.success:
|
|
successful_pages += 1
|
|
print(f" SUCCESS: {result.url} (Depth: {result.metadata.get('depth')})")
|
|
assert "/archive/" not in result.url
|
|
else:
|
|
failed_pages += 1
|
|
print(f" FAILURE: {result.url} (Error: {result.error_message}, Status: {result.status_code})")
|
|
|
|
print(f"\nTotal Successful: {successful_pages}, Total Failed/Filtered Out by crawler: {failed_pages}")
|
|
# Start URL + index links (page1, page2, external, blog, login, failing) = 7 initial candidates
|
|
# - external might be skipped by default include_external=False (depends on strategy)
|
|
# - /archive/ is filtered by URLPatternFilter
|
|
# - failing_page.html will fail
|
|
# So, we expect start_url + page1, page2, blog, login. Failing page is in results but success=False.
|
|
# The number of results includes the start_url and pages that were attempted.
|
|
# Filters apply to links *discovered* from a page.
|
|
|
|
# One page (/archive/old_page.html) should be filtered by the filter chain.
|
|
# One page (failing_page.html) should be in results but with success=False.
|
|
assert any("failing_page.html" in r.url and not r.success for r in results)
|
|
assert not any("/archive/" in r.url for r in results)
|
|
|
|
# Clean up mock data
|
|
del MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/failing_page.html"]
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"] = MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/index.html"]["html_content"].replace('<a href="failing_page.html">Failing Page</a>', '')
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(deep_crawl_handling_failures())
|
|
```
|
|
|
|
### 7.4. Example: Using a custom `logger` instance passed to a `DeepCrawlStrategy`.
|
|
|
|
```python
|
|
import asyncio
|
|
import logging
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, AsyncLogger
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def deep_crawl_custom_logger():
|
|
# Setup a custom logger
|
|
custom_logger = AsyncLogger(log_file="custom_deep_crawl.log", name="MyDeepCrawler", level="DEBUG")
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=0, logger=custom_logger) # Pass logger to strategy
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
print("--- Deep Crawl with Custom Logger ---")
|
|
async with AsyncWebCrawler() as crawler: # Main crawler logger can be default
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print("Crawl complete. Check 'custom_deep_crawl.log' for logs from the strategy.")
|
|
# You can verify the log file content here if needed
|
|
# e.g., with open("custom_deep_crawl.log", "r") as f: assert "MyDeepCrawler" in f.read()
|
|
# For this example, just visual confirmation is sufficient.
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(deep_crawl_custom_logger())
|
|
```
|
|
|
|
### 7.5. Example: Deep crawling starting from a local HTML file that contains links to other local files or web URLs.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
from pathlib import Path
|
|
import os
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def deep_crawl_from_local_file():
|
|
# Ensure the mock local files exist for the test
|
|
local_index_path = Path(os.getcwd()) / "test_local_index.html"
|
|
local_page1_path = Path(os.getcwd()) / "test_local_page1.html"
|
|
|
|
# If not created by preamble, create them
|
|
if not local_index_path.exists():
|
|
local_index_path.write_text(MOCK_SITE_DATA[f"file://{local_index_path}"]["html_content"])
|
|
if not local_page1_path.exists():
|
|
local_page1_path.write_text(MOCK_SITE_DATA[f"file://{local_page1_path}"]["html_content"])
|
|
|
|
start_file_url = f"file://{local_index_path.resolve()}"
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, include_external=True) # Allow following to web URLs
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
print(f"--- Deep Crawling from Local File: {start_file_url} ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
results = await crawler.arun(url=start_file_url, config=run_config)
|
|
|
|
print(f"Crawled {len(results)} pages.")
|
|
found_local_link = False
|
|
found_web_link = False
|
|
for result in results:
|
|
print(f" URL: {result.url}, Depth: {result.metadata.get('depth')}")
|
|
if result.url == f"file://{local_page1_path.resolve()}":
|
|
found_local_link = True
|
|
if result.url == "https://docs.crawl4ai.com/vibe-examples/index.html":
|
|
found_web_link = True
|
|
|
|
assert found_local_link, "Did not follow local file link."
|
|
assert found_web_link, "Did not follow web link from local file."
|
|
|
|
# Clean up dummy files
|
|
if local_index_path.exists(): os.remove(local_index_path)
|
|
if local_page1_path.exists(): os.remove(local_page1_path)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(deep_crawl_from_local_file())
|
|
```
|
|
|
|
### 7.6. Example: Comparing outputs from `BFSDeePCrawlStrategy`, `DFSDeePCrawlStrategy`, and `BestFirstCrawlingStrategy`.
|
|
This example runs all three main strategies with similar settings to highlight differences in traversal and results.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
from crawl4ai import BFSDeePCrawlStrategy, DFSDeePCrawlStrategy, BestFirstCrawlingStrategy, KeywordRelevanceScorer
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def compare_deep_crawl_strategies():
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
max_depth = 2
|
|
max_pages = 7 # Keep it manageable for comparison
|
|
|
|
common_config_params = {
|
|
"max_depth": max_depth,
|
|
"max_pages": max_pages,
|
|
"include_external": False, # Keep it simple for comparison
|
|
}
|
|
|
|
scorer = KeywordRelevanceScorer(keywords=["feature", "core"])
|
|
|
|
strategies_to_compare = {
|
|
"BFS": BFSDeePCrawlStrategy(**common_config_params),
|
|
"DFS": DFSDeePCrawlStrategy(**common_config_params),
|
|
"Best-First": BestFirstCrawlingStrategy(**common_config_params, url_scorer=scorer)
|
|
}
|
|
|
|
print(f"--- Comparing Deep Crawl Strategies (max_depth={max_depth}, max_pages={max_pages}) ---")
|
|
|
|
async with AsyncWebCrawler() as crawler:
|
|
for name, strategy_instance in strategies_to_compare.items():
|
|
print(f"\n-- Running {name} Strategy --")
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy_instance,
|
|
cache_mode=CacheMode.BYPASS,
|
|
stream=False # Batch for easier comparison of final set
|
|
)
|
|
|
|
start_time = time.perf_counter()
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
duration = time.perf_counter() - start_time
|
|
|
|
print(f" {name} crawled {len(results)} pages in {duration:.2f}s.")
|
|
# Sort by depth then URL for consistent output for BFS/DFS
|
|
# For Best-First, sort by score (desc) then depth then URL
|
|
if name == "Best-First":
|
|
sorted_results = sorted(results, key=lambda r: (r.metadata.get('score', 0.0), -r.metadata.get('depth', 0), r.url), reverse=True)
|
|
else:
|
|
sorted_results = sorted(results, key=lambda r: (r.metadata.get('depth', 0), r.url))
|
|
|
|
|
|
for i, r in enumerate(sorted_results):
|
|
if i < 5 or i > len(sorted_results) - 3 : # Show first 5 and last 2
|
|
score_str = f", Score: {r.metadata.get('score', 0.0):.2f}" if name == "Best-First" else ""
|
|
print(f" URL: {r.url} (Depth: {r.metadata.get('depth')}{score_str})")
|
|
elif i == 5:
|
|
print(f" ... ({len(sorted_results) - 5 -2 } more results) ...")
|
|
print("-" * 30)
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(compare_deep_crawl_strategies())
|
|
```
|
|
|
|
---
|
|
## 8. Advanced Scenarios & Customization
|
|
|
|
### 8.1. Example: Implementing a custom `DeepCrawlStrategy` by subclassing `DeepCrawlStrategy`.
|
|
This provides a skeleton for creating your own crawl logic.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DeepCrawlStrategy, CrawlResult
|
|
from typing import List, Set, Dict, AsyncGenerator, Tuple
|
|
from unittest.mock import patch
|
|
|
|
class MyCustomDeepCrawlStrategy(DeepCrawlStrategy):
|
|
def __init__(self, max_depth=1, **kwargs):
|
|
self.max_depth = max_depth
|
|
# Potentially other custom init params
|
|
super().__init__(**kwargs) # Pass along other kwargs if base class uses them
|
|
print("MyCustomDeepCrawlStrategy Initialized")
|
|
|
|
async def _arun_batch(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> List[CrawlResult]:
|
|
print(f"[Custom Strategy] _arun_batch called for: {start_url}")
|
|
# Implement batch crawling logic (e.g., BFS-like)
|
|
# This is a simplified version. A real one needs queue, visited set, depth tracking etc.
|
|
results = []
|
|
initial_result_container = await crawler.arun(url=start_url, config=config.clone(deep_crawl_strategy=None))
|
|
initial_result = initial_result_container[0] # arun returns a list
|
|
|
|
if not initial_result.success: return [initial_result]
|
|
results.append(initial_result)
|
|
|
|
if self.max_depth > 0 and initial_result.links.get("internal"):
|
|
for link_info in initial_result.links["internal"][:2]: # Crawl first 2 internal links
|
|
link_url = link_info["href"]
|
|
# Pass metadata for depth and parent
|
|
link_config = config.clone(deep_crawl_strategy=None)
|
|
|
|
# In a real strategy, you'd manage metadata directly or pass it for crawler.arun
|
|
# For this mock, we simplify as crawler.arun normally doesn't take depth/parent for single page
|
|
print(f" [Custom Strategy] Crawling linked URL: {link_url} at depth 1")
|
|
linked_result_container = await crawler.arun(url=link_url, config=link_config)
|
|
linked_result = linked_result_container[0]
|
|
# Manually add metadata for this example
|
|
if linked_result.metadata is None: linked_result.metadata = {}
|
|
linked_result.metadata['depth'] = 1
|
|
linked_result.metadata['parent_url'] = start_url
|
|
results.append(linked_result)
|
|
return results
|
|
|
|
async def _arun_stream(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> AsyncGenerator[CrawlResult, None]:
|
|
print(f"[Custom Strategy] _arun_stream called for: {start_url}")
|
|
# Implement streaming crawling logic
|
|
# Simplified: yields results from a batch-like process for this example
|
|
batch_results = await self._arun_batch(start_url, crawler, config)
|
|
for result in batch_results:
|
|
yield result
|
|
|
|
async def can_process_url(self, url: str, depth: int) -> bool:
|
|
# Example: only process URLs not containing "archive" and within max_depth
|
|
print(f"[Custom Strategy] can_process_url called for: {url}, depth: {depth}")
|
|
if "archive" in url:
|
|
return False
|
|
return depth <= self.max_depth
|
|
|
|
async def link_discovery(
|
|
self, result: CrawlResult, source_url: str, current_depth: int,
|
|
visited: Set[str], next_level: List[Tuple[str, str]], depths: Dict[str, int]
|
|
) -> None:
|
|
# This method is crucial for discovering and queuing new links.
|
|
# The base class might have a default implementation, or you might need to call
|
|
# crawler.arun to get links if result.links is not populated.
|
|
# For this example, we'll assume result.links is populated by the crawler.
|
|
print(f"[Custom Strategy] link_discovery for: {source_url} at depth {current_depth}")
|
|
new_depth = current_depth + 1
|
|
if new_depth > self.max_depth:
|
|
return
|
|
|
|
for link_info in result.links.get("internal", [])[:3]: # Limit for example
|
|
link_url = link_info["href"]
|
|
if link_url not in visited and await self.can_process_url(link_url, new_depth):
|
|
next_level.append((link_url, source_url)) # (url, parent_url)
|
|
depths[link_url] = new_depth
|
|
print(f" [Custom Strategy] Discovered and added to queue: {link_url}")
|
|
|
|
async def shutdown(self):
|
|
print("[Custom Strategy] Shutdown called.")
|
|
# Implement any cleanup or signal to stop crawling loops
|
|
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def custom_deep_crawl_strategy_example():
|
|
custom_strategy = MyCustomDeepCrawlStrategy(max_depth=1)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=custom_strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
print("--- Using Custom DeepCrawlStrategy ---")
|
|
async with AsyncWebCrawler() as crawler: # This will be MockAsyncWebCrawler
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"\nCustom strategy crawled {len(results)} pages:")
|
|
for r in results:
|
|
print(f" URL: {r.url}, Success: {r.success}, Depth: {r.metadata.get('depth') if r.metadata else 'N/A'}")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(custom_deep_crawl_strategy_example())
|
|
```
|
|
|
|
### 8.2. Example: Implementing a custom `URLFilter`.
|
|
`URLFilter` itself is a concrete class, but you can create custom logic by making a callable class or function that adheres to the expected filter signature `(url: str) -> bool`. For more complex stateful filters, subclassing a base might be an option if one is provided or creating your own structure.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, FilterChain
|
|
from unittest.mock import patch
|
|
|
|
class MyCustomURLFilter:
|
|
def __init__(self, forbidden_keyword: str):
|
|
self.forbidden_keyword = forbidden_keyword.lower()
|
|
print(f"MyCustomURLFilter initialized to block URLs with '{self.forbidden_keyword}'")
|
|
|
|
async def __call__(self, url: str) -> bool: # Filters must be async
|
|
"""Return True if URL should be allowed, False if blocked."""
|
|
if self.forbidden_keyword in url.lower():
|
|
print(f"[CustomFilter] Blocking URL: {url} (contains '{self.forbidden_keyword}')")
|
|
return False # Block if keyword found
|
|
print(f"[CustomFilter] Allowing URL: {url}")
|
|
return True # Allow otherwise
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def custom_url_filter_example():
|
|
custom_filter = MyCustomURLFilter(forbidden_keyword="archive")
|
|
filter_chain = FilterChain(filters=[custom_filter])
|
|
|
|
strategy = BFSDeePCrawlStrategy(max_depth=1, filter_chain=filter_chain)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
print("--- Using Custom URLFilter (blocking 'archive') ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"\nCustom filter crawl resulted in {len(results)} pages:")
|
|
for r in results:
|
|
print(f" URL: {r.url}")
|
|
assert "archive" not in r.url.lower(), f"Custom filter failed to block {r.url}"
|
|
print("Successfully blocked URLs containing 'archive'.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(custom_url_filter_example())
|
|
```
|
|
|
|
### 8.3. Example: Implementing a custom `URLScorer` for `BestFirstCrawlingStrategy`.
|
|
Subclass `URLScorer` and implement the `score` method.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BestFirstCrawlingStrategy, URLScorer
|
|
from urllib.parse import urlparse
|
|
from unittest.mock import patch
|
|
|
|
class MyCustomURLScorer(URLScorer):
|
|
def __init__(self, preferred_domain: str, weight: float = 1.0):
|
|
super().__init__(weight)
|
|
self.preferred_domain = preferred_domain
|
|
print(f"MyCustomURLScorer initialized, preferring domain: {self.preferred_domain}")
|
|
|
|
def score(self, url: str, **kwargs) -> float:
|
|
"""Scores URL based on whether it matches the preferred domain."""
|
|
parsed_url = urlparse(url)
|
|
score = 0.0
|
|
if parsed_url.netloc == self.preferred_domain:
|
|
score = 1.0 * self.weight
|
|
print(f"[CustomScorer] URL {url} matches preferred domain. Score: {score}")
|
|
else:
|
|
score = 0.1 * self.weight # Lower score for other domains
|
|
print(f"[CustomScorer] URL {url} does NOT match preferred domain. Score: {score}")
|
|
return score
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def custom_url_scorer_example():
|
|
custom_scorer = MyCustomURLScorer(preferred_domain="docs.crawl4ai.com", weight=2.0)
|
|
|
|
strategy = BestFirstCrawlingStrategy(
|
|
max_depth=1,
|
|
url_scorer=custom_scorer,
|
|
include_external=True, # To allow scoring external domains differently
|
|
max_pages=5
|
|
)
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=strategy, cache_mode=CacheMode.BYPASS, stream=True)
|
|
|
|
print("--- Using Custom URLScorer (preferring 'docs.crawl4ai.com') ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
if result.success:
|
|
print(f" URL: {result.url}, Score: {result.metadata.get('score', 0.0):.2f}")
|
|
print("Pages from 'docs.crawl4ai.com' should generally have higher scores.")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(custom_url_scorer_example())
|
|
```
|
|
|
|
### 8.4. Example: Deep crawling a site with very large number of pages efficiently using `max_pages` and streaming.
|
|
This combines `max_pages` to limit the scope and `stream=True` to process results incrementally, which is crucial for very large crawls to manage memory and get feedback sooner.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy
|
|
from unittest.mock import patch
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def deep_crawl_large_site_efficiently():
|
|
# Simulate a large site by setting a high conceptual depth,
|
|
# but limit actual work with max_pages.
|
|
strategy = BFSDeePCrawlStrategy(
|
|
max_depth=10, # Imagine this could lead to thousands of pages
|
|
max_pages=10, # But we only want the first 10 found by BFS
|
|
include_external=False
|
|
)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=strategy,
|
|
stream=True, # Process results as they come
|
|
cache_mode=CacheMode.BYPASS # Or CacheMode.ENABLED for subsequent partial crawls
|
|
)
|
|
|
|
print("--- Efficiently Crawling a 'Large' Site (max_pages=10, stream=True) ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html" # Use our mock site
|
|
|
|
crawled_count = 0
|
|
async for result in await crawler.arun(url=start_url, config=run_config):
|
|
crawled_count += 1
|
|
if result.success:
|
|
print(f" Processed ({crawled_count}/{strategy.max_pages}): {result.url} at depth {result.metadata.get('depth')}")
|
|
else:
|
|
print(f" Failed ({crawled_count}/{strategy.max_pages}): {result.url} - {result.error_message}")
|
|
|
|
if crawled_count >= strategy.max_pages:
|
|
print(f"Reached max_pages limit of {strategy.max_pages}. Stopping.")
|
|
# In a real scenario, you might need to call strategy.shutdown() if the crawler
|
|
# doesn't automatically stop precisely at max_pages when streaming.
|
|
# However, strategies are designed to respect max_pages.
|
|
break
|
|
|
|
print(f"\nTotal pages processed: {crawled_count}")
|
|
assert crawled_count <= strategy.max_pages
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(deep_crawl_large_site_efficiently())
|
|
```
|
|
|
|
### 8.5. Example: Combining deep crawling with `LLMExtractionStrategy` to extract structured data from each crawled page.
|
|
This example shows setting up a deep crawl where each successfully crawled page's content is then passed to an `LLMExtractionStrategy`.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, LLMExtractionStrategy, LLMConfig
|
|
from pydantic import BaseModel, Field
|
|
from unittest.mock import patch
|
|
|
|
class PageSummary(BaseModel):
|
|
title: str = Field(description="The main title of the page.")
|
|
brief_summary: str = Field(description="A one-sentence summary of the page content.")
|
|
|
|
# Mock the LLM call within the extraction strategy for this example
|
|
async def mock_llm_extract(self, url: str, sections: list[str]):
|
|
print(f"[Mock LLM] Extracting from {url}, first section: {sections[0][:50]}...")
|
|
# Based on the URL from MOCK_SITE_DATA, return a plausible mock summary
|
|
if "index.html" in url:
|
|
return [{"title": "Index", "brief_summary": "This is the main page."}]
|
|
elif "page1.html" in url:
|
|
return [{"title": "Page 1", "brief_summary": "Content about crawl strategies."}]
|
|
elif "page2.html" in url:
|
|
return [{"title": "Page 2 - Feature Rich", "brief_summary": "Discusses a key feature."}]
|
|
return [{"title": "Unknown Title", "brief_summary": "Could not summarize."}]
|
|
|
|
@patch('crawl4ai.extraction_strategy.LLMExtractionStrategy.run', side_effect=mock_llm_extract)
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def deep_crawl_with_llm_extraction(mock_llm_run): # mock_llm_run is from the patch
|
|
llm_config = LLMConfig(provider="mock/mock-model") # Mock provider
|
|
|
|
extraction_strategy = LLMExtractionStrategy(
|
|
llm_config=llm_config,
|
|
schema=PageSummary.model_json_schema(), # Use Pydantic model for schema
|
|
extraction_type="schema",
|
|
instruction="Extract the title and a brief summary for the provided HTML content."
|
|
)
|
|
|
|
deep_crawl_config = BFSDeePCrawlStrategy(max_depth=1, max_pages=3)
|
|
|
|
run_config = CrawlerRunConfig(
|
|
deep_crawl_strategy=deep_crawl_config,
|
|
extraction_strategy=extraction_strategy, # Apply this to each crawled page
|
|
cache_mode=CacheMode.BYPASS
|
|
)
|
|
|
|
print("--- Deep Crawl with LLM Extraction on Each Page ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
for result in results:
|
|
if result.success:
|
|
print(f"\nCrawled URL: {result.url}")
|
|
if result.extracted_content:
|
|
print(f" Extracted Data: {result.extracted_content}")
|
|
else:
|
|
print(" No data extracted (or LLM mock returned empty).")
|
|
else:
|
|
print(f"\nFailed to crawl URL: {result.url} - {result.error_message}")
|
|
|
|
assert mock_llm_run.called, "LLM Extraction strategy's run method was not called."
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(deep_crawl_with_llm_extraction())
|
|
```
|
|
|
|
### 8.6. Example: Scenario for using `can_process_url` within a strategy to dynamically decide if a URL should be added to the queue.
|
|
Override `can_process_url` in a custom strategy to implement dynamic filtering logic based on URL and current depth.
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeePCrawlStrategy, CrawlResult
|
|
from typing import List, Set, Dict, Tuple
|
|
from unittest.mock import patch
|
|
|
|
class DepthAndPatternAwareBFSStrategy(BFSDeePCrawlStrategy):
|
|
async def can_process_url(self, url: str, depth: int) -> bool:
|
|
# Standard checks from parent (like filter_chain)
|
|
if not await super().can_process_url(url, depth):
|
|
print(f"[Custom can_process_url] Blocked by parent: {url}")
|
|
return False
|
|
|
|
# Custom logic: Do not process '/archive/' pages if depth is > 1
|
|
if depth > 1 and "/archive/" in url:
|
|
print(f"[Custom can_process_url] Blocking deep archive page: {url} at depth {depth}")
|
|
return False
|
|
|
|
print(f"[Custom can_process_url] Allowing: {url} at depth {depth}")
|
|
return True
|
|
|
|
@patch('crawl4ai.AsyncWebCrawler', MockAsyncWebCrawler)
|
|
async def custom_can_process_url_example():
|
|
# Add a deeper archive link for testing
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1.html"]["html_content"] += '<a href="archive/deep_archive.html">Deep Archive</a>'
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/archive/deep_archive.html"] = {
|
|
"html_content": "<html><title>Deep Archive</title><body>Very old stuff.</body></html>",
|
|
"response_headers": {"Content-Type": "text/html"}
|
|
}
|
|
|
|
custom_strategy = DepthAndPatternAwareBFSStrategy(max_depth=2) # Crawl up to depth 2
|
|
run_config = CrawlerRunConfig(deep_crawl_strategy=custom_strategy, cache_mode=CacheMode.BYPASS)
|
|
|
|
print("--- Custom Strategy with Dynamic can_process_url ---")
|
|
async with AsyncWebCrawler() as crawler:
|
|
start_url = "https://docs.crawl4ai.com/vibe-examples/index.html"
|
|
results = await crawler.arun(url=start_url, config=run_config)
|
|
|
|
print(f"\nCrawled {len(results)} pages:")
|
|
archive_at_depth_1_crawled = False
|
|
deep_archive_blocked = True
|
|
|
|
for r in results:
|
|
print(f" URL: {r.url}, Depth: {r.metadata.get('depth')}")
|
|
if "/archive/old_page.html" in r.url and r.metadata.get('depth') == 1:
|
|
archive_at_depth_1_crawled = True
|
|
if "/archive/deep_archive.html" in r.url and r.metadata.get('depth') == 2:
|
|
# This should not happen due to our custom can_process_url
|
|
deep_archive_blocked = False
|
|
|
|
assert archive_at_depth_1_crawled, "Archive page at depth 1 should have been crawled."
|
|
assert deep_archive_blocked, "Deep archive page at depth 2 should have been blocked by custom can_process_url."
|
|
print("Dynamic URL processing logic worked as expected.")
|
|
|
|
# Clean up mock data
|
|
MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1.html"]["html_content"] = MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/page1.html"]["html_content"].replace('<a href="archive/deep_archive.html">Deep Archive</a>', '')
|
|
del MOCK_SITE_DATA["https://docs.crawl4ai.com/vibe-examples/archive/deep_archive.html"]
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(custom_can_process_url_example())
|
|
# Clean up dummy files after all examples run
|
|
if (Path(os.getcwd()) / "test_local_index.html").exists():
|
|
os.remove(Path(os.getcwd()) / "test_local_index.html")
|
|
if (Path(os.getcwd()) / "test_local_page1.html").exists():
|
|
os.remove(Path(os.getcwd()) / "test_local_page1.html")
|
|
if Path("custom_deep_crawl.log").exists():
|
|
os.remove("custom_deep_crawl.log")
|
|
|
|
``` |