I'll expand the outline with detailed descriptions and examples based on all the provided files:
1. Basic Web Crawling
Basic web crawling provides the foundation for extracting content from websites. The library supports both simple single-page crawling and recursive website crawling.
```python
# Simple page crawling
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.html)          # Raw HTML
    print(result.markdown)      # Cleaned markdown
    print(result.cleaned_html)  # Cleaned HTML
```
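The snippets in this outline assume they run inside a coroutine. A minimal entry point (the `main` name is just a placeholder) might look like this:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```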
```python
# Recursive website crawling
class SimpleWebsiteScraper:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler

    async def scrape(self, start_url: str, max_depth: int):
        # scrape_recursive walks internal links up to max_depth (see the sketch below)
        results = await self.scrape_recursive(start_url, max_depth)
        return results

# Usage
async with AsyncWebCrawler() as crawler:
    scraper = SimpleWebsiteScraper(crawler)
    results = await scraper.scrape("https://example.com", max_depth=2)
```
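The `scrape_recursive` helper is not defined above; a minimal breadth-first sketch (an assumption, using the `result.links['internal']` structure described later in this outline) could be added to `SimpleWebsiteScraper` like this:

```python
from urllib.parse import urljoin, urldefrag

# Hypothetical implementation, intended to live inside SimpleWebsiteScraper
async def scrape_recursive(self, start_url: str, max_depth: int):
    visited, results = set(), {}
    queue = [(start_url, 0)]  # (url, depth) pairs, breadth-first
    while queue:
        url, depth = queue.pop(0)
        if url in visited or depth > max_depth:
            continue
        visited.add(url)
        result = await self.crawler.arun(url=url)
        results[url] = result
        for link in result.links['internal']:
            # Resolve relative URLs and strip fragments before queueing
            next_url, _ = urldefrag(urljoin(url, link['href']))
            if next_url not in visited:
                queue.append((next_url, depth + 1))
    return results
```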
2. Browser Control Options
The library provides extensive control over browser behavior, allowing customization of browser type, headless mode, and proxy settings.
```python
# Browser Type Selection
async with AsyncWebCrawler(
    browser_type="firefox",  # Options: "chromium", "firefox", "webkit"
    headless=False,          # For visible browser
    verbose=True             # Enable logging
) as crawler:
    result = await crawler.arun(url="https://example.com")
```
```python
# Proxy Configuration
async with AsyncWebCrawler(
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass"
    },
    headers={
        "User-Agent": "Custom User Agent",
        "Accept-Language": "en-US,en;q=0.9"
    }
) as crawler:
    result = await crawler.arun(url="https://example.com")
```
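To avoid hard-coding proxy credentials, one option (a sketch; the environment variable names are arbitrary) is to build the same `proxy_config` dictionary from the environment:

```python
import os

proxy_config = {
    "server": os.environ.get("CRAWL_PROXY_SERVER", "http://proxy.example.com:8080"),
    "username": os.environ.get("CRAWL_PROXY_USER", ""),
    "password": os.environ.get("CRAWL_PROXY_PASS", ""),
}

async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
    result = await crawler.arun(url="https://example.com")
```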
3. Content Selection & Filtering
The library offers multiple ways to select and filter content, from CSS selectors to word count thresholds.
```python
# CSS Selector and Content Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        css_selector="article.main-content",                # Extract specific content
        word_count_threshold=10,                             # Minimum words per block
        excluded_tags=['form', 'header'],                    # Tags to exclude
        exclude_external_links=True,                         # Remove external links
        exclude_social_media_links=True,                     # Remove social media links
        exclude_domains=["pinterest.com", "facebook.com"]    # Exclude specific domains
    )
```
```python
# Custom HTML to Text Options
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        html2text={
            "escape_dot": False,
            "links_each_paragraph": True,
            "protect_links": True
        }
    )
```
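When a selector matches nothing or the page fails to load, the crawl result reflects that. A small guard (assuming the `success` and `error_message` fields on the result object) keeps downstream processing safe:

```python
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        css_selector="article.main-content"
    )
    if not result.success:
        # error_message is assumed to carry the failure reason
        print(f"Crawl failed: {result.error_message}")
    elif not result.markdown:
        print("Selector matched no content")
    else:
        print(result.markdown[:500])  # Preview the first 500 characters
```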
4. Dynamic Content Handling
The library provides sophisticated handling of dynamic content with JavaScript execution and wait conditions.
```python
# JavaScript Execution and Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        js_code=[
            "window.scrollTo(0, document.body.scrollHeight);",
            "document.querySelector('.load-more').click();"
        ],
        wait_for="css:.dynamic-content",   # Wait for element
        delay_before_return_html=2.0       # Wait after JS execution
    )
```
```python
# Smart Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        wait_for="""() => {
            return document.querySelectorAll('.item').length > 10;
        }""",
        page_timeout=60000  # 60 seconds timeout
    )
```
5. Advanced Link Analysis
The library provides comprehensive link analysis capabilities, distinguishing between internal and external links, with options for filtering and processing.
```python
# Basic Link Analysis
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Access internal and external links
    for internal_link in result.links['internal']:
        print(f"Internal: {internal_link['href']} - {internal_link['text']}")
    for external_link in result.links['external']:
        print(f"External: {external_link['href']} - {external_link['text']}")
```
```python
# Advanced Link Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        exclude_external_links=True,       # Remove all external links
        exclude_social_media_links=True,   # Remove social media links
        exclude_social_media_domains=[     # Custom social media domains
            "facebook.com", "twitter.com", "instagram.com"
        ],
        exclude_domains=["pinterest.com"]  # Specific domains to exclude
    )
```
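Extracted hrefs may be relative or contain fragments and duplicates. A small post-processing step with the standard library (a sketch independent of the crawler API) normalizes them before further crawling:

```python
from urllib.parse import urljoin, urldefrag

def normalize_links(base_url, links):
    """Resolve relative hrefs against base_url, strip #fragments, and de-duplicate."""
    seen, normalized = set(), []
    for link in links:
        absolute, _ = urldefrag(urljoin(base_url, link['href']))
        if absolute not in seen:
            seen.add(absolute)
            normalized.append(absolute)
    return normalized

internal_urls = normalize_links("https://example.com", result.links['internal'])
```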
6. Anti-Bot Protection Handling
The library includes sophisticated anti-detection mechanisms to handle websites with bot protection.
```python
# Basic Anti-Detection
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        simulate_user=True,       # Simulate human behavior
        override_navigator=True   # Override navigator properties
    )
```
```python
# Advanced Anti-Detection with Magic Mode
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        magic=True,                     # Enable all anti-detection features
        remove_overlay_elements=True,   # Remove popups/modals automatically
        # Custom navigator properties
        js_code="""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
        """
    )
```
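Pacing requests also helps avoid triggering rate-based protections. A simple pattern (not a library feature, just plain asyncio) inserts a randomized delay between successive crawls:

```python
import asyncio
import random

urls = ["https://example.com/page1", "https://example.com/page2"]

async with AsyncWebCrawler() as crawler:
    for url in urls:
        result = await crawler.arun(url=url, magic=True)
        # Sleep 2-5 seconds between requests to mimic human browsing cadence
        await asyncio.sleep(random.uniform(2.0, 5.0))
```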
7. Session Management
Session management allows maintaining state across multiple requests and handling cookies.
```python
# Basic Session Management
async with AsyncWebCrawler() as crawler:
    session_id = "my_session"

    # Login
    login_result = await crawler.arun(
        url="https://example.com/login",
        session_id=session_id,
        js_code="document.querySelector('form').submit();"
    )

    # Use same session for subsequent requests
    protected_result = await crawler.arun(
        url="https://example.com/protected",
        session_id=session_id
    )

    # Clean up session
    await crawler.crawler_strategy.kill_session(session_id)
```
```python
# Advanced Session with Custom Cookies
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        session_id="custom_session",
        cookies=[{
            "name": "sessionId",
            "value": "abc123",
            "domain": "example.com"
        }]
    )
```
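Sessions are also useful for paginated listings where login state must persist across pages. A sketch (the URL pattern is hypothetical) reuses one `session_id` for every page and cleans it up afterwards:

```python
async with AsyncWebCrawler() as crawler:
    session_id = "catalog_session"
    pages = []
    for page_number in range(1, 6):
        # Each page is fetched in the same browser session, so cookies persist
        result = await crawler.arun(
            url=f"https://example.com/catalog?page={page_number}",
            session_id=session_id
        )
        pages.append(result.markdown)
    await crawler.crawler_strategy.kill_session(session_id)
```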
8. Screenshot and Media Handling
The library provides comprehensive media handling capabilities, including screenshots and media content extraction.
```python
# Screenshot Capture
import base64

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        screenshot=True,
        screenshot_wait_for=2.0  # Wait before taking screenshot
    )

    # Save screenshot (returned as a base64-encoded string)
    if result.screenshot:
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))
```
```python
# Media Extraction
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Process images with metadata
    for image in result.media['images']:
        print(f"Image: {image['src']}")
        print(f"Alt text: {image['alt']}")
        print(f"Context: {image['desc']}")
        print(f"Relevance score: {image['score']}")

    # Process videos and audio
    for video in result.media['videos']:
        print(f"Video: {video['src']}")
    for audio in result.media['audios']:
        print(f"Audio: {audio['src']}")
```
9. Structured Data Extraction & Chunking
The library supports multiple strategies for structured data extraction and content chunking.
```python
# LLM-based Extraction
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class NewsArticle(BaseModel):
    title: str
    content: str
    author: str

extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4',
    api_token="your-token",
    schema=NewsArticle.schema(),
    instruction="Extract news article details",
    chunk_token_threshold=1000,
    overlap_rate=0.1
)
```
```python
# CSS-based Extraction
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "name": "Product Listing",
    "baseSelector": ".product-card",
    "fields": [
        {
            "name": "title",
            "selector": "h2",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".price",
            "type": "text",
            "transform": "strip"
        }
    ]
}

css_strategy = JsonCssExtractionStrategy(schema)
```
```python
# Text Chunking
from crawl4ai.chunking_strategy import OverlappingWindowChunking

chunking_strategy = OverlappingWindowChunking(
    window_size=1000,
    overlap=100
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=extraction_strategy,
        chunking_strategy=chunking_strategy
    )
```
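Extraction strategies return their output as a JSON string on the result. A short post-processing step (assuming the `extracted_content` attribute holds that JSON) turns it back into Python objects:

```python
import json

if result.extracted_content:
    articles = json.loads(result.extracted_content)
    for article in articles:
        print(article.get("title"), "-", article.get("author"))
```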
10. Content Cleaning & Processing
The library provides extensive content cleaning and processing capabilities, ensuring high-quality output in various formats.
```python
# Basic Content Cleaning
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        remove_overlay_elements=True,   # Remove popups/modals
        process_iframes=True,           # Process iframe content
        word_count_threshold=10         # Minimum words per block
    )
    print(result.cleaned_html)    # Clean HTML
    print(result.fit_html)        # Most relevant HTML content
    print(result.fit_markdown)    # Most relevant markdown content
```
```python
# Advanced Content Processing
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        excluded_tags=['form', 'header', 'footer', 'nav'],
        html2text={
            "escape_dot": False,
            "body_width": 0,
            "protect_links": True,
            "unicode_snob": True,
            "ignore_links": False,
            "ignore_images": False,
            "ignore_emphasis": False,
            "bypass_tables": False,
            "ignore_tables": False
        }
    )
```
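The cleaned output is often archived for later processing. A small sketch (file names are arbitrary) writes both the full and the "fit" markdown to disk:

```python
from pathlib import Path

output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# Full markdown of the cleaned page
(output_dir / "page.md").write_text(result.markdown or "", encoding="utf-8")
# Only the most relevant content, if the fit_markdown field is populated
if result.fit_markdown:
    (output_dir / "page_fit.md").write_text(result.fit_markdown, encoding="utf-8")
```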
Advanced Usage Patterns
1. Combining Multiple Features
```python
async with AsyncWebCrawler(
    browser_type="chromium",
    headless=False,
    verbose=True
) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        # Anti-bot measures
        magic=True,
        simulate_user=True,
        # Content selection
        css_selector="article.main",
        word_count_threshold=10,
        # Dynamic content handling
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.dynamic-content",
        # Content filtering
        exclude_external_links=True,
        exclude_social_media_links=True,
        # Media handling
        screenshot=True,
        process_iframes=True,
        # Content cleaning
        remove_overlay_elements=True
    )
```
2. Custom Extraction Pipeline
```python
# Define custom schemas and strategies
class Article(BaseModel):
    title: str
    content: str
    date: str

# CSS extraction for initial content
css_schema = {
    "name": "Article Extraction",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h1", "type": "text"},
        {"name": "content", "selector": ".content", "type": "html"},
        {"name": "date", "selector": ".date", "type": "text"}
    ]
}

# LLM processing for semantic analysis
llm_strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",
    api_token="your-token",
    schema=Article.schema(),
    instruction="Extract and clean article content"
)

# Chunking strategy for large content
chunking = OverlappingWindowChunking(window_size=1000, overlap=100)

async with AsyncWebCrawler() as crawler:
    # First pass: Extract structure
    css_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=JsonCssExtractionStrategy(css_schema)
    )

    # Second pass: Semantic processing
    llm_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=llm_strategy,
        chunking_strategy=chunking
    )
```
3. Website Crawling with Custom Processing
```python
from typing import Dict

class CustomWebsiteCrawler:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler
        self.results = {}

    def _is_valid_link(self, href: str) -> bool:
        # Minimal validity check: keep absolute HTTP(S) links only
        return href.startswith("http://") or href.startswith("https://")

    async def process_page(self, url: str) -> Dict:
        result = await self.crawler.arun(
            url=url,
            magic=True,
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True
        )

        # Process internal links
        internal_links = [
            link['href'] for link in result.links['internal']
            if self._is_valid_link(link['href'])
        ]

        # Extract media
        media_urls = [img['src'] for img in result.media['images']]

        return {
            'content': result.markdown,
            'links': internal_links,
            'media': media_urls,
            'metadata': result.metadata
        }

    async def crawl_website(self, start_url: str, max_depth: int = 2):
        visited = set()
        queue = [(start_url, 0)]
        while queue:
            url, depth = queue.pop(0)
            if depth > max_depth or url in visited:
                continue
            visited.add(url)
            self.results[url] = await self.process_page(url)
            # Queue newly discovered internal links for the next depth level
            for link in self.results[url]['links']:
                if link not in visited:
                    queue.append((link, depth + 1))
```
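A minimal driver for the crawler above (the start URL and depth are placeholders) ties everything together:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        site_crawler = CustomWebsiteCrawler(crawler)
        await site_crawler.crawl_website("https://example.com", max_depth=2)
        for url, page in site_crawler.results.items():
            print(url, "->", len(page['links']), "internal links")

if __name__ == "__main__":
    asyncio.run(main())
```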