I'll expand the outline with detailed descriptions and examples based on all the provided files. I'll start with the first few sections:

### 1. Basic Web Crawling

Basic web crawling provides the foundation for extracting content from websites. The library supports both simple single-page crawling and recursive website crawling.
```python
# Simple page crawling
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.html)          # Raw HTML
    print(result.markdown)      # Cleaned markdown
    print(result.cleaned_html)  # Cleaned HTML

# Recursive website crawling
class SimpleWebsiteScraper:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler

    async def scrape(self, start_url: str, max_depth: int):
        results = await self.scrape_recursive(start_url, max_depth, visited=set())
        return results

    async def scrape_recursive(self, url: str, depth: int, visited: set) -> dict:
        # One possible recursive step: fetch the page, then follow its internal links
        if depth < 0 or url in visited:
            return {}
        visited.add(url)
        results = {url: await self.crawler.arun(url=url)}
        for link in results[url].links['internal']:
            results.update(await self.scrape_recursive(link['href'], depth - 1, visited))
        return results

# Usage
async with AsyncWebCrawler() as crawler:
    scraper = SimpleWebsiteScraper(crawler)
    results = await scraper.scrape("https://example.com", max_depth=2)
```
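
All of these snippets use `await`, so in a standalone script they need to run inside an async entry point. A minimal sketch, assuming the package is installed as `crawl4ai` and exposes `AsyncWebCrawler` at the top level:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```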

### 2. Browser Control Options

The library provides extensive control over browser behavior, allowing customization of browser type, headless mode, and proxy settings.
```python
# Browser Type Selection
async with AsyncWebCrawler(
    browser_type="firefox",  # Options: "chromium", "firefox", "webkit"
    headless=False,          # For visible browser
    verbose=True             # Enable logging
) as crawler:
    result = await crawler.arun(url="https://example.com")

# Proxy Configuration
async with AsyncWebCrawler(
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass"
    },
    headers={
        "User-Agent": "Custom User Agent",
        "Accept-Language": "en-US,en;q=0.9"
    }
) as crawler:
    result = await crawler.arun(url="https://example.com")
```

### 3. Content Selection & Filtering

The library offers multiple ways to select and filter content, from CSS selectors to word count thresholds.
```python
# CSS Selector and Content Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        css_selector="article.main-content",  # Extract specific content
        word_count_threshold=10,              # Minimum words per block
        excluded_tags=['form', 'header'],     # Tags to exclude
        exclude_external_links=True,          # Remove external links
        exclude_social_media_links=True,      # Remove social media links
        exclude_domains=["pinterest.com", "facebook.com"]  # Exclude specific domains
    )

# Custom HTML to Text Options
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        html2text={
            "escape_dot": False,
            "links_each_paragraph": True,
            "protect_links": True
        }
    )
```

### 4. Dynamic Content Handling

The library provides sophisticated handling of dynamic content with JavaScript execution and wait conditions.
```python
# JavaScript Execution and Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        js_code=[
            "window.scrollTo(0, document.body.scrollHeight);",
            "document.querySelector('.load-more').click();"
        ],
        wait_for="css:.dynamic-content",  # Wait for element
        delay_before_return_html=2.0      # Wait after JS execution
    )

# Smart Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        wait_for="""() => {
            return document.querySelectorAll('.item').length > 10;
        }""",
        page_timeout=60000  # 60 seconds timeout
    )
```

### 5. Advanced Link Analysis

The library provides comprehensive link analysis capabilities, distinguishing between internal and external links, with options for filtering and processing.
```python
# Basic Link Analysis
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Access internal and external links
    for internal_link in result.links['internal']:
        print(f"Internal: {internal_link['href']} - {internal_link['text']}")

    for external_link in result.links['external']:
        print(f"External: {external_link['href']} - {external_link['text']}")

# Advanced Link Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        exclude_external_links=True,      # Remove all external links
        exclude_social_media_links=True,  # Remove social media links
        exclude_social_media_domains=[    # Custom social media domains
            "facebook.com", "twitter.com", "instagram.com"
        ],
        exclude_domains=["pinterest.com"]  # Specific domains to exclude
    )
```
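
The `href` values in these link dictionaries may be relative, so before feeding them back into a crawl queue it can help to resolve and deduplicate them with the standard library. A small sketch (the helper name is just illustrative, and `result` is assumed to come from one of the calls above):

```python
from urllib.parse import urljoin

def normalize_links(base_url: str, links: list) -> list:
    # Resolve relative hrefs against the page URL and drop duplicates, preserving order
    seen, resolved = set(), []
    for link in links:
        absolute = urljoin(base_url, link['href'])
        if absolute not in seen:
            seen.add(absolute)
            resolved.append(absolute)
    return resolved

internal_urls = normalize_links("https://example.com", result.links['internal'])
```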

### 6. Anti-Bot Protection Handling

The library includes sophisticated anti-detection mechanisms to handle websites with bot protection.
```python
# Basic Anti-Detection
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        simulate_user=True,      # Simulate human behavior
        override_navigator=True  # Override navigator properties
    )

# Advanced Anti-Detection with Magic Mode
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        magic=True,                    # Enable all anti-detection features
        remove_overlay_elements=True,  # Remove popups/modals automatically
        # Custom navigator properties
        js_code="""
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        """
    )
```

### 7. Session Management

Session management allows maintaining state across multiple requests and handling cookies.
```python
# Basic Session Management
async with AsyncWebCrawler() as crawler:
    session_id = "my_session"

    # Login
    login_result = await crawler.arun(
        url="https://example.com/login",
        session_id=session_id,
        js_code="document.querySelector('form').submit();"
    )

    # Use same session for subsequent requests
    protected_result = await crawler.arun(
        url="https://example.com/protected",
        session_id=session_id
    )

    # Clean up session
    await crawler.crawler_strategy.kill_session(session_id)

# Advanced Session with Custom Cookies
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        session_id="custom_session",
        cookies=[{
            "name": "sessionId",
            "value": "abc123",
            "domain": "example.com"
        }]
    )
```
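
Reusing one `session_id` is also a common way to step through paginated or "load more" style content, since the same browser tab persists between calls. A rough sketch, assuming a `.next-page` button exists on the target page and that repeated `arun` calls with the same `session_id` continue in that tab:

```python
async with AsyncWebCrawler() as crawler:
    session_id = "pagination_session"
    pages = []

    for page_number in range(3):
        result = await crawler.arun(
            url="https://example.com/listing",
            session_id=session_id,
            # Hypothetical selector: click "next" on every call after the first load
            js_code=None if page_number == 0 else "document.querySelector('.next-page').click();",
            wait_for="css:.item"
        )
        pages.append(result.markdown)

    await crawler.crawler_strategy.kill_session(session_id)
```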

### 8. Screenshot and Media Handling

The library provides comprehensive media handling, including screenshot capture and extraction of image, video, and audio metadata.
```python
import base64

# Screenshot Capture
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        screenshot=True,
        screenshot_wait_for=2.0  # Wait before taking the screenshot
    )

    # Save screenshot (returned as a base64-encoded string)
    if result.screenshot:
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))

# Media Extraction
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Process images with metadata
    for image in result.media['images']:
        print(f"Image: {image['src']}")
        print(f"Alt text: {image['alt']}")
        print(f"Context: {image['desc']}")
        print(f"Relevance score: {image['score']}")

    # Process videos and audio
    for video in result.media['videos']:
        print(f"Video: {video['src']}")

    for audio in result.media['audios']:
        print(f"Audio: {audio['src']}")
```
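
Because each image entry carries a relevance score, the media list can be trimmed to the most useful items before any downloading or further processing. A minimal sketch (the threshold of 5 and limit of 10 are arbitrary choices):

```python
def top_images(media: dict, min_score: float = 5, limit: int = 10) -> list:
    # Keep only images at or above the score threshold, highest-scoring first
    scored = [img for img in media['images'] if img.get('score', 0) >= min_score]
    scored.sort(key=lambda img: img['score'], reverse=True)
    return scored[:limit]

for img in top_images(result.media):
    print(img['src'], img['score'])
```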

### 9. Structured Data Extraction & Chunking

The library supports multiple strategies for structured data extraction and content chunking.
```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy, JsonCssExtractionStrategy
from crawl4ai.chunking_strategy import OverlappingWindowChunking

# LLM-based Extraction
class NewsArticle(BaseModel):
    title: str
    content: str
    author: str

extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4',
    api_token="your-token",
    schema=NewsArticle.schema(),
    instruction="Extract news article details",
    chunk_token_threshold=1000,
    overlap_rate=0.1
)

# CSS-based Extraction
schema = {
    "name": "Product Listing",
    "baseSelector": ".product-card",
    "fields": [
        {
            "name": "title",
            "selector": "h2",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".price",
            "type": "text",
            "transform": "strip"
        }
    ]
}

css_strategy = JsonCssExtractionStrategy(schema)

# Text Chunking
chunking_strategy = OverlappingWindowChunking(
    window_size=1000,
    overlap=100
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=extraction_strategy,
        chunking_strategy=chunking_strategy
    )
```
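
Extraction strategies typically place their output on the result as a JSON string (assumed here to be `result.extracted_content`), which can be loaded back into the Pydantic model for validation:

```python
import json

if result.extracted_content:
    records = json.loads(result.extracted_content)
    # The LLM strategy generally returns a list of blocks matching the schema
    articles = [NewsArticle(**r) for r in records if not r.get("error")]
    for article in articles:
        print(f"{article.title} - {article.author}")
```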

### 10. Content Cleaning & Processing

The library provides extensive content cleaning and processing capabilities, ensuring high-quality output in various formats.
```python
# Basic Content Cleaning
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        remove_overlay_elements=True,  # Remove popups/modals
        process_iframes=True,          # Process iframe content
        word_count_threshold=10        # Minimum words per block
    )

    print(result.cleaned_html)   # Clean HTML
    print(result.fit_html)       # Most relevant HTML content
    print(result.fit_markdown)   # Most relevant markdown content

# Advanced Content Processing
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        excluded_tags=['form', 'header', 'footer', 'nav'],
        html2text={
            "escape_dot": False,
            "body_width": 0,
            "protect_links": True,
            "unicode_snob": True,
            "ignore_links": False,
            "ignore_images": False,
            "ignore_emphasis": False,
            "bypass_tables": False,
            "ignore_tables": False
        }
    )
```
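
A natural follow-up is to persist the different views of the page so the raw, cleaned, and "fit" outputs can be compared offline. A small sketch using only the standard library (file names are arbitrary):

```python
from pathlib import Path

outputs = {
    "page.cleaned.html": result.cleaned_html,
    "page.fit.md": result.fit_markdown,
    "page.full.md": result.markdown,
}
for filename, content in outputs.items():
    if content:  # Some fields may be empty depending on the crawl options
        Path(filename).write_text(content, encoding="utf-8")
```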

### Advanced Usage Patterns

#### 1. Combining Multiple Features
```python
async with AsyncWebCrawler(
    browser_type="chromium",
    headless=False,
    verbose=True
) as crawler:
    result = await crawler.arun(
        url="https://example.com",

        # Anti-bot measures
        magic=True,
        simulate_user=True,

        # Content selection
        css_selector="article.main",
        word_count_threshold=10,

        # Dynamic content handling
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.dynamic-content",

        # Content filtering
        exclude_external_links=True,
        exclude_social_media_links=True,

        # Media handling
        screenshot=True,
        process_iframes=True,

        # Content cleaning
        remove_overlay_elements=True
    )
```

#### 2. Custom Extraction Pipeline
```python
# Define custom schemas and strategies
class Article(BaseModel):
    title: str
    content: str
    date: str

# CSS extraction for initial content
css_schema = {
    "name": "Article Extraction",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h1", "type": "text"},
        {"name": "content", "selector": ".content", "type": "html"},
        {"name": "date", "selector": ".date", "type": "text"}
    ]
}

# LLM processing for semantic analysis
llm_strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",
    api_token="your-token",
    schema=Article.schema(),
    instruction="Extract and clean article content"
)

# Chunking strategy for large content
chunking = OverlappingWindowChunking(window_size=1000, overlap=100)

async with AsyncWebCrawler() as crawler:
    # First pass: Extract structure
    css_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=JsonCssExtractionStrategy(css_schema)
    )

    # Second pass: Semantic processing
    llm_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=llm_strategy,
        chunking_strategy=chunking
    )
```
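
The two passes can then be merged into a single record, again assuming each strategy leaves its output as a JSON string on `extracted_content`:

```python
import json

structured = json.loads(css_result.extracted_content or "[]")  # Selector-based fields
semantic = json.loads(llm_result.extracted_content or "[]")    # LLM-cleaned content

combined = {"structured": structured, "semantic": semantic}
print(json.dumps(combined, indent=2)[:500])  # Preview the merged record
```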

#### 3. Website Crawling with Custom Processing
```python
from typing import Dict

class CustomWebsiteCrawler:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler
        self.results = {}

    def _is_valid_link(self, href: str) -> bool:
        # Minimal check: absolute http(s) URL that has not been processed yet
        return href.startswith("http") and href not in self.results

    async def process_page(self, url: str) -> Dict:
        result = await self.crawler.arun(
            url=url,
            magic=True,
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True
        )

        # Process internal links
        internal_links = [
            link['href'] for link in result.links['internal']
            if self._is_valid_link(link['href'])
        ]

        # Extract media
        media_urls = [img['src'] for img in result.media['images']]

        return {
            'content': result.markdown,
            'links': internal_links,
            'media': media_urls,
            'metadata': result.metadata
        }

    async def crawl_website(self, start_url: str, max_depth: int = 2):
        visited = set()
        queue = [(start_url, 0)]

        while queue:
            url, depth = queue.pop(0)
            if depth > max_depth or url in visited:
                continue

            visited.add(url)
            self.results[url] = await self.process_page(url)

            # Enqueue this page's internal links for the next depth level
            for link in self.results[url]['links']:
                queue.append((link, depth + 1))
```
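
A minimal way to drive this crawler end to end might look like the following (the URL and depth are placeholders):

```python
import asyncio

async def run_custom_crawl():
    async with AsyncWebCrawler() as crawler:
        site_crawler = CustomWebsiteCrawler(crawler)
        await site_crawler.crawl_website("https://example.com", max_depth=2)
        for url, page in site_crawler.results.items():
            print(url, "->", len(page['links']), "internal links")

asyncio.run(run_custom_crawl())
```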