# Crawl4AI Feature Details
### 1. Basic Web Crawling

Basic web crawling provides the foundation for extracting content from websites. The library supports both simple single-page crawling and recursive website crawling.

```python
from crawl4ai import AsyncWebCrawler

# Simple page crawling
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.html)          # Raw HTML
    print(result.markdown)      # Cleaned markdown
    print(result.cleaned_html)  # Cleaned HTML

# Recursive website crawling
class SimpleWebsiteScraper:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler

    async def scrape(self, start_url: str, max_depth: int) -> dict:
        # Minimal breadth-first traversal: follow internal links up to
        # max_depth, collecting one crawl result per visited URL.
        results, visited = {}, set()
        queue = [(start_url, 0)]
        while queue:
            url, depth = queue.pop(0)
            if depth > max_depth or url in visited:
                continue
            visited.add(url)
            result = await self.crawler.arun(url=url)
            results[url] = result
            for link in result.links.get('internal', []):
                queue.append((link['href'], depth + 1))
        return results

# Usage
async with AsyncWebCrawler() as crawler:
    scraper = SimpleWebsiteScraper(crawler)
    results = await scraper.scrape("https://example.com", max_depth=2)
```

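The snippets above assume an already-running event loop. In a standalone script, wrap the crawl in an `async` entry point and hand it to `asyncio.run` (plain standard-library Python; only the names from the example above are used):

```python
import asyncio

from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```
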
### 2. Browser Control Options

The library provides extensive control over browser behavior, allowing customization of browser type, headless mode, and proxy settings.

```python
# Browser Type Selection
async with AsyncWebCrawler(
    browser_type="firefox",  # Options: "chromium", "firefox", "webkit"
    headless=False,          # For visible browser
    verbose=True             # Enable logging
) as crawler:
    result = await crawler.arun(url="https://example.com")

# Proxy Configuration
async with AsyncWebCrawler(
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass"
    },
    headers={
        "User-Agent": "Custom User Agent",
        "Accept-Language": "en-US,en;q=0.9"
    }
) as crawler:
    result = await crawler.arun(url="https://example.com")
```

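Because `proxy_config` is fixed when the crawler is constructed, a simple way to rotate proxies is to open one crawler per proxy. A minimal sketch, reusing only the options shown above (the proxy URLs are placeholders):

```python
proxies = [
    {"server": "http://proxy1.example.com:8080"},
    {"server": "http://proxy2.example.com:8080"},
]

async def crawl_with_rotation(urls):
    results = []
    for i, url in enumerate(urls):
        # Pick the next proxy round-robin and bind a fresh crawler to it.
        async with AsyncWebCrawler(proxy_config=proxies[i % len(proxies)]) as crawler:
            results.append(await crawler.arun(url=url))
    return results
```
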
### 3. Content Selection & Filtering

The library offers multiple ways to select and filter content, from CSS selectors to word count thresholds.

```python
# CSS Selector and Content Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        css_selector="article.main-content",  # Extract specific content
        word_count_threshold=10,              # Minimum words per block
        excluded_tags=['form', 'header'],     # Tags to exclude
        exclude_external_links=True,          # Remove external links
        exclude_social_media_links=True,      # Remove social media links
        exclude_domains=["pinterest.com", "facebook.com"]  # Exclude specific domains
    )

# Custom HTML to Text Options
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        html2text={
            "escape_dot": False,
            "links_each_paragraph": True,
            "protect_links": True
        }
    )
```

### 4. Dynamic Content Handling

The library provides sophisticated handling of dynamic content with JavaScript execution and wait conditions.

```python
# JavaScript Execution and Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        js_code=[
            "window.scrollTo(0, document.body.scrollHeight);",
            "document.querySelector('.load-more').click();"
        ],
        wait_for="css:.dynamic-content",  # Wait for element
        delay_before_return_html=2.0      # Wait after JS execution
    )

# Smart Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        wait_for="""() => {
            return document.querySelectorAll('.item').length > 10;
        }""",
        page_timeout=60000  # 60 seconds timeout
    )
```

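For content that loads in stages (e.g., a "load more" button that must be clicked repeatedly), one pattern is to keep the page alive across calls and run extra JavaScript each round. A sketch, assuming `arun` accepts a `session_id` (covered in section 7) and a `js_only` flag that re-runs JS in the live page without re-navigating; treat `js_only` as an assumption to verify against your version of the library:

```python
async with AsyncWebCrawler() as crawler:
    session_id = "load_more_session"
    result = await crawler.arun(
        url="https://example.com",
        session_id=session_id,
        wait_for="css:.item"
    )
    for _ in range(3):
        # Assumption: js_only=True re-executes JS in the existing session page.
        result = await crawler.arun(
            url="https://example.com",
            session_id=session_id,
            js_code="document.querySelector('.load-more').click();",
            js_only=True,
            delay_before_return_html=1.0
        )
    await crawler.crawler_strategy.kill_session(session_id)
```
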
### 5. Advanced Link Analysis

The library provides comprehensive link analysis capabilities, distinguishing between internal and external links, with options for filtering and processing.

```python
# Basic Link Analysis
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Access internal and external links
    for internal_link in result.links['internal']:
        print(f"Internal: {internal_link['href']} - {internal_link['text']}")

    for external_link in result.links['external']:
        print(f"External: {external_link['href']} - {external_link['text']}")

# Advanced Link Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        exclude_external_links=True,      # Remove all external links
        exclude_social_media_links=True,  # Remove social media links
        exclude_social_media_domains=[    # Custom social media domains
            "facebook.com", "twitter.com", "instagram.com"
        ],
        exclude_domains=["pinterest.com"]  # Specific domains to exclude
    )
```

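Extracted links often repeat the same page under different fragments or relative forms. A small standard-library sketch for normalizing and de-duplicating them (only the `result.links` structure shown above is assumed):

```python
from urllib.parse import urldefrag, urljoin

def unique_internal_urls(result, base_url):
    seen = set()
    for link in result.links['internal']:
        # Resolve relative hrefs against the page URL and drop #fragments.
        url, _ = urldefrag(urljoin(base_url, link['href']))
        seen.add(url)
    return sorted(seen)
```
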
### 6. Anti-Bot Protection Handling

The library includes sophisticated anti-detection mechanisms to handle websites with bot protection.

```python
# Basic Anti-Detection
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        simulate_user=True,      # Simulate human behavior
        override_navigator=True  # Override navigator properties
    )

# Advanced Anti-Detection with Magic Mode
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        magic=True,                    # Enable all anti-detection features
        remove_overlay_elements=True,  # Remove popups/modals automatically
        # Custom navigator properties
        js_code="""
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        """
    )
```

### 7. Session Management

Session management allows maintaining state across multiple requests and handling cookies.

```python
# Basic Session Management
async with AsyncWebCrawler() as crawler:
    session_id = "my_session"

    # Login
    login_result = await crawler.arun(
        url="https://example.com/login",
        session_id=session_id,
        js_code="document.querySelector('form').submit();"
    )

    # Use same session for subsequent requests
    protected_result = await crawler.arun(
        url="https://example.com/protected",
        session_id=session_id
    )

    # Clean up session
    await crawler.crawler_strategy.kill_session(session_id)

# Advanced Session with Custom Cookies
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        session_id="custom_session",
        cookies=[{
            "name": "sessionId",
            "value": "abc123",
            "domain": "example.com"
        }]
    )
```

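If any request in the session raises, the `kill_session` call above is skipped. A small sketch that guarantees cleanup with `try`/`finally`, using only the calls already shown:

```python
async with AsyncWebCrawler() as crawler:
    session_id = "my_session"
    try:
        await crawler.arun(url="https://example.com/login", session_id=session_id)
        result = await crawler.arun(url="https://example.com/protected", session_id=session_id)
    finally:
        # Always release the session, even if a request failed.
        await crawler.crawler_strategy.kill_session(session_id)
```
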
### 8. Screenshot and Media Handling

The library provides comprehensive media handling capabilities, including screenshots and media content extraction.

```python
import base64

# Screenshot Capture
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        screenshot=True,
        screenshot_wait_for=2.0  # Wait before taking screenshot
    )

    # Save screenshot (returned as a base64-encoded string)
    if result.screenshot:
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))

# Media Extraction
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Process images with metadata
    for image in result.media['images']:
        print(f"Image: {image['src']}")
        print(f"Alt text: {image['alt']}")
        print(f"Context: {image['desc']}")
        print(f"Relevance score: {image['score']}")

    # Process videos and audio
    for video in result.media['videos']:
        print(f"Video: {video['src']}")
    for audio in result.media['audios']:
        print(f"Audio: {audio['src']}")
```

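Since each image carries a relevance score, a common follow-up is to keep only images above a threshold. A minimal sketch over the `result.media` structure shown above (the 0.5 cutoff is an arbitrary example value):

```python
def relevant_images(result, min_score=0.5):
    # Keep only images the crawler scored as relevant to the page content.
    return [
        img['src'] for img in result.media['images']
        if img.get('score', 0) >= min_score
    ]
```
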
### 9. Structured Data Extraction & Chunking

The library supports multiple strategies for structured data extraction and content chunking.

```python
from pydantic import BaseModel

from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

# LLM-based Extraction
class NewsArticle(BaseModel):
    title: str
    content: str
    author: str

extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4',
    api_token="your-token",
    schema=NewsArticle.schema(),
    instruction="Extract news article details",
    chunk_token_threshold=1000,
    overlap_rate=0.1
)

# CSS-based Extraction
schema = {
    "name": "Product Listing",
    "baseSelector": ".product-card",
    "fields": [
        {
            "name": "title",
            "selector": "h2",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".price",
            "type": "text",
            "transform": "strip"
        }
    ]
}

css_strategy = JsonCssExtractionStrategy(schema)

# Text Chunking
chunking_strategy = OverlappingWindowChunking(
    window_size=1000,
    overlap=100
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=extraction_strategy,
        chunking_strategy=chunking_strategy
    )
```

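The structured output lands on the crawl result as a JSON string, so consuming it is one `json.loads` away. A minimal sketch, assuming the extraction above succeeded and its data is exposed on `result.extracted_content`:

```python
import json

if result.extracted_content:
    articles = json.loads(result.extracted_content)
    for article in articles:
        print(article.get('title'), '-', article.get('author'))
```
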
### 10. Content Cleaning & Processing

The library provides extensive content cleaning and processing capabilities, ensuring high-quality output in various formats.

```python
# Basic Content Cleaning
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        remove_overlay_elements=True,  # Remove popups/modals
        process_iframes=True,          # Process iframe content
        word_count_threshold=10        # Minimum words per block
    )

    print(result.cleaned_html)   # Clean HTML
    print(result.fit_html)       # Most relevant HTML content
    print(result.fit_markdown)   # Most relevant markdown content

# Advanced Content Processing
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        excluded_tags=['form', 'header', 'footer', 'nav'],
        html2text={
            "escape_dot": False,
            "body_width": 0,
            "protect_links": True,
            "unicode_snob": True,
            "ignore_links": False,
            "ignore_images": False,
            "ignore_emphasis": False,
            "bypass_tables": False,
            "ignore_tables": False
        }
    )
```

### Advanced Usage Patterns

#### 1. Combining Multiple Features

```python
async with AsyncWebCrawler(
    browser_type="chromium",
    headless=False,
    verbose=True
) as crawler:
    result = await crawler.arun(
        url="https://example.com",

        # Anti-bot measures
        magic=True,
        simulate_user=True,

        # Content selection
        css_selector="article.main",
        word_count_threshold=10,

        # Dynamic content handling
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.dynamic-content",

        # Content filtering
        exclude_external_links=True,
        exclude_social_media_links=True,

        # Media handling
        screenshot=True,
        process_iframes=True,

        # Content cleaning
        remove_overlay_elements=True
    )
```

#### 2. Custom Extraction Pipeline

```python
from pydantic import BaseModel

from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

# Define custom schemas and strategies
class Article(BaseModel):
    title: str
    content: str
    date: str

# CSS extraction for initial content
css_schema = {
    "name": "Article Extraction",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h1", "type": "text"},
        {"name": "content", "selector": ".content", "type": "html"},
        {"name": "date", "selector": ".date", "type": "text"}
    ]
}

# LLM processing for semantic analysis
llm_strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",
    api_token="your-token",
    schema=Article.schema(),
    instruction="Extract and clean article content"
)

# Chunking strategy for large content
chunking = OverlappingWindowChunking(window_size=1000, overlap=100)

async with AsyncWebCrawler() as crawler:
    # First pass: Extract structure
    css_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=JsonCssExtractionStrategy(css_schema)
    )

    # Second pass: Semantic processing
    llm_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=llm_strategy,
        chunking_strategy=chunking
    )
```

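To combine the two passes, parse each pass's output and merge per article. A sketch, assuming both results expose their data as JSON strings on `extracted_content` and return articles in the same order (index alignment is a simplification; match on a key like `title` in real use):

```python
import json

structured = json.loads(css_result.extracted_content or "[]")
semantic = json.loads(llm_result.extracted_content or "[]")

# Pair structural fields with the LLM's cleaned version, index-aligned.
merged = [
    {**css_item, **llm_item}
    for css_item, llm_item in zip(structured, semantic)
]
```
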
#### 3. Website Crawling with Custom Processing

```python
from typing import Dict
from urllib.parse import urlparse

class CustomWebsiteCrawler:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler
        self.results = {}

    def _is_valid_link(self, href: str) -> bool:
        # Minimal sanity check: keep only absolute http(s) URLs.
        return urlparse(href).scheme in ("http", "https")

    async def process_page(self, url: str) -> Dict:
        result = await self.crawler.arun(
            url=url,
            magic=True,
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True
        )

        # Process internal links
        internal_links = [
            link['href'] for link in result.links['internal']
            if self._is_valid_link(link['href'])
        ]

        # Extract media
        media_urls = [img['src'] for img in result.media['images']]

        return {
            'content': result.markdown,
            'links': internal_links,
            'media': media_urls,
            'metadata': result.metadata
        }

    async def crawl_website(self, start_url: str, max_depth: int = 2):
        visited = set()
        queue = [(start_url, 0)]

        while queue:
            url, depth = queue.pop(0)
            if depth > max_depth or url in visited:
                continue

            visited.add(url)
            self.results[url] = await self.process_page(url)

            # Follow the links discovered on this page.
            for link in self.results[url]['links']:
                queue.append((link, depth + 1))
```

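A usage sketch to drive the class above, mirroring the `asyncio.run` pattern from section 1:

```python
import asyncio

async def main():
    async with AsyncWebCrawler() as crawler:
        site_crawler = CustomWebsiteCrawler(crawler)
        await site_crawler.crawl_website("https://example.com", max_depth=2)
        for url, page in site_crawler.results.items():
            print(url, "-", len(page['links']), "links")

asyncio.run(main())
```
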