This commit introduces significant enhancements to the Crawl4AI ecosystem.

Chrome Extension - Script Builder (Alpha):
- Add recording functionality to capture user interactions (clicks, typing, scrolling)
- Implement smart event grouping for cleaner script generation
- Support export to both JavaScript and C4A script formats
- Add timeline view for visualizing and editing recorded actions
- Include wait commands (time-based and element-based)
- Add saved flows functionality for reusing automation scripts
- Update UI with consistent dark terminal theme (Dank Mono font, green/pink accents)
- Release new extension versions: v1.1.0, v1.2.0, v1.2.1

LLM Context Builder Improvements:
- Reorganize context files from llmtxt/ to llm.txt/ with better structure
- Separate diagram templates from text content (diagrams/ and txt/ subdirectories)
- Add comprehensive context files for all major Crawl4AI components
- Improve file naming convention for better discoverability

Documentation Updates:
- Update apps index page to match main documentation theme
- Standardize color scheme: "Available" tags use primary color (#50ffff)
- Change "Coming Soon" tags to dark gray for better visual hierarchy
- Add interactive two-column layout for extension landing page
- Include code examples for both Schema Builder and Script Builder features

Technical Improvements:
- Enhance event capture mechanism with better element selection
- Add support for contenteditable elements and complex form interactions
- Implement proper scroll event handling for both window and element scrolling
- Add meta key support for keyboard shortcuts
- Improve selector generation for more reliable element targeting

The Script Builder is released as Alpha, acknowledging potential bugs while providing early access to this powerful automation recording feature.

## Browser, Crawler & LLM Configuration

Core configuration classes for controlling browser behavior, crawl operations, LLM providers, and understanding crawl results.

### BrowserConfig - Browser Environment Setup

```python
from crawl4ai import BrowserConfig, AsyncWebCrawler

# Basic browser configuration
browser_config = BrowserConfig(
    browser_type="chromium",   # "chromium", "firefox", "webkit"
    headless=True,             # False for visible browser (debugging)
    viewport_width=1280,
    viewport_height=720,
    verbose=True
)

# Advanced browser setup with proxy and persistence
browser_config = BrowserConfig(
    headless=False,
    proxy="http://user:pass@proxy:8080",
    use_persistent_context=True,
    user_data_dir="./browser_data",
    cookies=[
        {"name": "session", "value": "abc123", "domain": "example.com"}
    ],
    headers={"Accept-Language": "en-US,en;q=0.9"},
    user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/116.0.0.0 Safari/537.36",
    text_mode=True,            # Disable images for faster crawling
    extra_args=["--disable-extensions", "--no-sandbox"]
)

async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun("https://example.com")
```

### CrawlerRunConfig - Crawl Operation Control

```python
from crawl4ai import CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter

# Basic crawl configuration
run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=10,
    excluded_tags=["nav", "footer", "script"],
    exclude_external_links=True,
    screenshot=True,
    pdf=True
)

# Advanced content processing
md_generator = DefaultMarkdownGenerator(
    content_filter=PruningContentFilter(threshold=0.6),
    options={"citations": True, "ignore_links": False}
)

run_config = CrawlerRunConfig(
    # Content processing
    markdown_generator=md_generator,
    css_selector="main.content",             # Focus on specific content
    target_elements=[".article", ".post"],   # Multiple target selectors
    process_iframes=True,
    remove_overlay_elements=True,

    # Page interaction
    js_code=[
        "window.scrollTo(0, document.body.scrollHeight);",
        "document.querySelector('.load-more')?.click();"
    ],
    wait_for="css:.content-loaded",
    wait_for_timeout=10000,
    scan_full_page=True,

    # Session management
    session_id="persistent_session",

    # Media handling
    screenshot=True,
    pdf=True,
    capture_mhtml=True,
    image_score_threshold=5,

    # Advanced options
    simulate_user=True,
    magic=True,   # Auto-handle popups
    verbose=True
)
```

### CrawlerRunConfig Parameters by Category

```python
# Content Processing
config = CrawlerRunConfig(
    word_count_threshold=10,                 # Min words per content block
    css_selector="main.article",             # Focus on specific content
    target_elements=[".post", ".content"],   # Multiple target selectors
    excluded_tags=["nav", "footer"],         # Remove these tags
    excluded_selector="#ads, .tracker",      # Remove by selector
    only_text=True,                          # Text-only extraction
    keep_data_attributes=True,               # Preserve data-* attributes
    remove_forms=True,                       # Remove all forms
    process_iframes=True                     # Include iframe content
)

# Page Navigation & Timing
config = CrawlerRunConfig(
    wait_until="networkidle",                # Wait condition
    page_timeout=60000,                      # 60 second timeout
    wait_for="css:.loaded",                  # Wait for specific element
    wait_for_images=True,                    # Wait for images to load
    delay_before_return_html=0.5,            # Final delay before capture
    semaphore_count=10                       # Max concurrent operations
)

# Page Interaction
config = CrawlerRunConfig(
    js_code="document.querySelector('button').click();",
    scan_full_page=True,                     # Auto-scroll page
    scroll_delay=0.3,                        # Delay between scrolls
    remove_overlay_elements=True,            # Remove popups/modals
    simulate_user=True,                      # Simulate human behavior
    override_navigator=True,                 # Override navigator properties
    magic=True                               # Auto-handle common patterns
)

# Caching & Session
config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,             # Cache behavior
    session_id="my_session",                 # Persistent session
    shared_data={"context": "value"}         # Share data between hooks
)

# Media & Output
config = CrawlerRunConfig(
    screenshot=True,                         # Capture screenshot
    pdf=True,                                # Generate PDF
    capture_mhtml=True,                      # Capture MHTML archive
    image_score_threshold=3,                 # Filter low-quality images
    exclude_external_images=True             # Remove external images
)

# Link & Domain Filtering
config = CrawlerRunConfig(
    exclude_external_links=True,             # Remove external links
    exclude_social_media_links=True,         # Remove social media links
    exclude_domains=["ads.com", "tracker.io"],  # Custom domain filter
    exclude_internal_links=False             # Keep internal links
)
```

### LLMConfig - Language Model Setup

```python
import os

from crawl4ai import LLMConfig

# OpenAI configuration
llm_config = LLMConfig(
    provider="openai/gpt-4o-mini",
    api_token=os.getenv("OPENAI_API_KEY"),   # or "env:OPENAI_API_KEY"
    temperature=0.1,
    max_tokens=2000
)

# Local model with Ollama
llm_config = LLMConfig(
    provider="ollama/llama3.3",
    api_token=None,                          # Not needed for Ollama
    base_url="http://localhost:11434"        # Custom endpoint
)

# Anthropic Claude
llm_config = LLMConfig(
    provider="anthropic/claude-3-5-sonnet-20240620",
    api_token="env:ANTHROPIC_API_KEY",
    max_tokens=4000
)

# Google Gemini
llm_config = LLMConfig(
    provider="gemini/gemini-1.5-pro",
    api_token="env:GEMINI_API_KEY"
)

# Groq (fast inference)
llm_config = LLMConfig(
    provider="groq/llama3-70b-8192",
    api_token="env:GROQ_API_KEY"
)
```

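On its own an `LLMConfig` does nothing; it has to be handed to an LLM-backed strategy. The sketch below shows one minimal wiring, using the `LLMExtractionStrategy` import shown later in this document; the `Product` schema, URL, and instruction text are placeholders, not part of the library.

```python
import json
from pydantic import BaseModel
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, LLMExtractionStrategy

# Hypothetical schema describing the fields we want the LLM to return
class Product(BaseModel):
    name: str
    price: str

llm_strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
    schema=Product.model_json_schema(),
    extraction_type="schema",
    instruction="Extract every product name and price on the page."
)

async def extract_products():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com/shop",   # placeholder URL
            config=CrawlerRunConfig(extraction_strategy=llm_strategy)
        )
        if result.success and result.extracted_content:
            print(json.loads(result.extracted_content))
```
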
### CrawlResult - Understanding Output

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def inspect_result(run_config: CrawlerRunConfig):
    # run_config: any CrawlerRunConfig, e.g. one of the examples above
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=run_config)

        # Basic status information
        print(f"Success: {result.success}")
        print(f"Status: {result.status_code}")
        print(f"URL: {result.url}")

        if not result.success:
            print(f"Error: {result.error_message}")
            return

        # HTML content variants
        print(f"Original HTML: {len(result.html)} chars")
        print(f"Cleaned HTML: {len(result.cleaned_html or '')} chars")

        # Markdown output (MarkdownGenerationResult)
        if result.markdown:
            print(f"Raw markdown: {len(result.markdown.raw_markdown)} chars")
            print(f"With citations: {len(result.markdown.markdown_with_citations)} chars")

            # Filtered content (if a content filter was used)
            if result.markdown.fit_markdown:
                print(f"Fit markdown: {len(result.markdown.fit_markdown)} chars")
                print(f"Fit HTML: {len(result.markdown.fit_html)} chars")

        # Extracted structured data
        if result.extracted_content:
            import json
            data = json.loads(result.extracted_content)
            print(f"Extracted {len(data)} items")

        # Media and links
        images = result.media.get("images", [])
        print(f"Found {len(images)} images")
        for img in images[:3]:   # First 3 images
            print(f"  {img.get('src')} (score: {img.get('score', 0)})")

        internal_links = result.links.get("internal", [])
        external_links = result.links.get("external", [])
        print(f"Links: {len(internal_links)} internal, {len(external_links)} external")

        # Generated files
        if result.screenshot:
            print(f"Screenshot captured: {len(result.screenshot)} chars (base64)")
            # Save screenshot
            import base64
            with open("page.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))

        if result.pdf:
            print(f"PDF generated: {len(result.pdf)} bytes")
            with open("page.pdf", "wb") as f:
                f.write(result.pdf)

        if result.mhtml:
            print(f"MHTML captured: {len(result.mhtml)} chars")
            with open("page.mhtml", "w", encoding="utf-8") as f:
                f.write(result.mhtml)

        # SSL certificate information
        if result.ssl_certificate:
            print(f"SSL Issuer: {result.ssl_certificate.issuer}")
            print(f"Valid until: {result.ssl_certificate.valid_until}")

        # Network and console data (if captured)
        if result.network_requests:
            requests = [r for r in result.network_requests if r.get("event_type") == "request"]
            print(f"Network requests captured: {len(requests)}")

        if result.console_messages:
            errors = [m for m in result.console_messages if m.get("type") == "error"]
            print(f"Console messages: {len(result.console_messages)} ({len(errors)} errors)")

        # Session and metadata
        if result.session_id:
            print(f"Session ID: {result.session_id}")

        if result.metadata:
            print(f"Metadata: {result.metadata.get('title', 'No title')}")
```

### Configuration Helpers and Best Practices

```python
# Clone configurations for variations
base_config = CrawlerRunConfig(
    cache_mode=CacheMode.ENABLED,
    word_count_threshold=200,
    verbose=True
)

# Create streaming version
stream_config = base_config.clone(
    stream=True,
    cache_mode=CacheMode.BYPASS
)

# Create debug version (headless belongs to BrowserConfig, so only run options are cloned here)
debug_config = base_config.clone(
    page_timeout=120000,
    verbose=True
)

# Serialize/deserialize configurations
config_dict = base_config.dump()                      # Convert to dict
restored_config = CrawlerRunConfig.load(config_dict)  # Restore from dict

# Browser configuration management
browser_config = BrowserConfig(headless=True, text_mode=True)
browser_dict = browser_config.to_dict()
cloned_browser = browser_config.clone(headless=False, verbose=True)
```

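Because `dump()` produces a plain dict, a config can also be written to disk and rebuilt later. A small sketch of that round trip, assuming the dumped options are JSON-serializable for the parameters used; the `config.json` path is arbitrary.

```python
import json
from crawl4ai import CrawlerRunConfig, CacheMode

base_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED, word_count_threshold=200)

# Persist the serialized config alongside your project
with open("config.json", "w", encoding="utf-8") as f:
    json.dump(base_config.dump(), f, indent=2, default=str)

# Rebuild the same config in another process or run
with open("config.json", "r", encoding="utf-8") as f:
    restored = CrawlerRunConfig.load(json.load(f))
```
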
### Common Configuration Patterns

```python
# Fast text-only crawling (pair with BrowserConfig(text_mode=True) to also skip image loading)
fast_config = CrawlerRunConfig(
    cache_mode=CacheMode.ENABLED,
    exclude_external_links=True,
    exclude_external_images=True,
    word_count_threshold=50
)

# Comprehensive data extraction
comprehensive_config = CrawlerRunConfig(
    process_iframes=True,
    scan_full_page=True,
    wait_for_images=True,
    screenshot=True,
    capture_network_requests=True,
    capture_console_messages=True,
    magic=True
)

# Stealth crawling
stealth_config = CrawlerRunConfig(
    simulate_user=True,
    override_navigator=True,
    mean_delay=2.0,
    max_range=1.0,
    user_agent_mode="random"
)
```

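These presets can be combined with a small routing helper when a single crawl mixes very different page types. The sketch below is one possible approach, not part of the library: `pick_config` is a hypothetical helper, the URL rules are placeholders, and it reuses the `fast_config`, `comprehensive_config`, and `stealth_config` objects defined above.

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

def pick_config(url: str) -> CrawlerRunConfig:
    # Illustrative routing rules - replace with your own
    if "/gallery" in url or "/docs" in url:
        return comprehensive_config    # heavy pages: full scan + captures
    if "login" in url:
        return stealth_config          # sensitive flows: human-like behavior
    return fast_config                 # default: fast text-focused pass

async def crawl_with_presets(urls):
    async with AsyncWebCrawler() as crawler:
        for url in urls:
            result = await crawler.arun(url, config=pick_config(url))
            print(url, "->", "ok" if result.success else result.error_message)
```
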
### Advanced Configuration Features

#### User Agent Management & Bot Detection Avoidance

```python
from crawl4ai import CrawlerRunConfig

# Random user agent generation
config = CrawlerRunConfig(
    user_agent_mode="random",
    user_agent_generator_config={
        "platform": "windows",      # "windows", "macos", "linux", "android", "ios"
        "browser": "chrome",        # "chrome", "firefox", "safari", "edge"
        "device_type": "desktop"    # "desktop", "mobile", "tablet"
    }
)

# Custom user agent with stealth features
config = CrawlerRunConfig(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    simulate_user=True,         # Simulate human mouse movements
    override_navigator=True,    # Override navigator properties
    mean_delay=1.5,             # Random delays between actions
    max_range=2.0
)

# Combined anti-detection approach
stealth_config = CrawlerRunConfig(
    user_agent_mode="random",
    simulate_user=True,
    override_navigator=True,
    magic=True,                 # Auto-handle common bot detection patterns
    delay_before_return_html=2.0
)
```

#### Proxy Configuration with ProxyConfig

```python
from crawl4ai import CrawlerRunConfig, ProxyConfig, ProxyRotationStrategy

# Single proxy configuration
proxy_config = ProxyConfig(
    server="http://proxy.example.com:8080",
    username="proxy_user",
    password="proxy_pass"
)

# From proxy string format
proxy_config = ProxyConfig.from_string("192.168.1.100:8080:username:password")

# Multiple proxies with rotation
proxies = [
    ProxyConfig(server="http://proxy1.com:8080", username="user1", password="pass1"),
    ProxyConfig(server="http://proxy2.com:8080", username="user2", password="pass2"),
    ProxyConfig(server="http://proxy3.com:8080", username="user3", password="pass3")
]

rotation_strategy = ProxyRotationStrategy(
    proxies=proxies,
    rotation_method="round_robin"   # or "random", "least_used"
)

config = CrawlerRunConfig(
    proxy_config=proxy_config,
    proxy_rotation_strategy=rotation_strategy
)

# Load proxies from environment variable
proxies_from_env = ProxyConfig.from_env("MY_PROXIES")   # comma-separated proxy strings
```

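The env-loading helper and the rotation strategy compose naturally. A sketch, assuming `MY_PROXIES` holds comma-separated `ip:port:username:password` entries and reusing the `ProxyRotationStrategy` constructor shown above:

```python
from crawl4ai import CrawlerRunConfig, ProxyConfig, ProxyRotationStrategy

# Example MY_PROXIES value: "10.0.0.1:8080:u1:p1,10.0.0.2:8080:u2:p2"
proxies = ProxyConfig.from_env("MY_PROXIES")

config = CrawlerRunConfig(
    proxy_rotation_strategy=ProxyRotationStrategy(
        proxies=proxies,
        rotation_method="round_robin"
    )
)
```
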
#### Content Selection: css_selector vs target_elements

```python
from crawl4ai import CrawlerRunConfig

# css_selector: extracts HTML at the top level and affects the entire processing pipeline
config = CrawlerRunConfig(
    css_selector="main.article, .content-area",   # Can be a list of selectors
    # Everything else (markdown, extraction, links) works only on this HTML subset
)

# target_elements: focuses extraction within the already processed HTML
config = CrawlerRunConfig(
    css_selector="body",           # First extract the entire body
    target_elements=[              # Then focus extraction on these elements
        ".article-content",
        ".post-body",
        ".main-text"
    ],
    # Links and media come from the entire body, but markdown/extraction only from target_elements
)

# Hierarchical content selection
config = CrawlerRunConfig(
    css_selector=["#main-content", ".article-wrapper"],   # Top-level extraction
    target_elements=[                                      # Subset for processing
        ".article-title",
        ".article-body",
        ".article-metadata"
    ],
    excluded_selector="#sidebar, .ads, .comments"          # Remove these from the selection
)
```

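A quick way to see the difference is to crawl the same page twice and compare what survives. The sketch below only inspects output sizes and link counts; `compare_selection_modes` and the selectors are placeholders, not library APIs.

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def compare_selection_modes(url: str):
    narrow = CrawlerRunConfig(css_selector="main.article")
    focused = CrawlerRunConfig(css_selector="body", target_elements=["main.article"])

    async with AsyncWebCrawler() as crawler:
        r1 = await crawler.arun(url, config=narrow)
        r2 = await crawler.arun(url, config=focused)

    # css_selector trims links/media too; target_elements keeps page-wide links
    print("css_selector:    markdown", len(r1.markdown.raw_markdown),
          "internal links", len(r1.links.get("internal", [])))
    print("target_elements: markdown", len(r2.markdown.raw_markdown),
          "internal links", len(r2.links.get("internal", [])))
```
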
#### Advanced wait_for Conditions

```python
from crawl4ai import CrawlerRunConfig

# CSS selector waiting
config = CrawlerRunConfig(
    wait_for="css:.content-loaded",   # Wait for element to appear
    wait_for_timeout=15000
)

# JavaScript boolean expression waiting
config = CrawlerRunConfig(
    wait_for="js:() => window.dataLoaded === true",   # Custom JS condition
    wait_for_timeout=20000
)

# Complex JavaScript conditions
config = CrawlerRunConfig(
    wait_for="js:() => document.querySelectorAll('.item').length >= 10",
    js_code=[
        "document.querySelector('.load-more')?.click();",
        "window.scrollTo(0, document.body.scrollHeight);"
    ]
)

# Multiple conditions with JavaScript
config = CrawlerRunConfig(
    wait_for="js:() => !document.querySelector('.loading') && document.querySelector('.results')",
    page_timeout=30000
)
```

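If a `wait_for` condition never becomes true, the crawl typically comes back as a failed result rather than hanging, so checking `result.success` and retrying with a looser config is a practical pattern. A sketch under that assumption; `crawl_with_fallback`, the selector, and the timeouts are placeholders.

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def crawl_with_fallback(url: str):
    strict = CrawlerRunConfig(wait_for="css:.content-loaded", wait_for_timeout=10000)
    relaxed = CrawlerRunConfig(page_timeout=30000)   # no wait_for, just load and capture

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, config=strict)
        if not result.success:
            print(f"Strict wait failed ({result.error_message}); retrying without wait_for")
            result = await crawler.arun(url, config=relaxed)
        return result
```
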
#### Session Management for Multi-Step Crawling

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Persistent session across multiple arun() calls
async def multi_step_crawling():
    async with AsyncWebCrawler() as crawler:
        # Step 1: Login page
        login_config = CrawlerRunConfig(
            session_id="user_session",   # Create persistent session
            js_code="document.querySelector('#username').value = 'user'; document.querySelector('#password').value = 'pass'; document.querySelector('#login').click();",
            wait_for="css:.dashboard",
            cache_mode=CacheMode.BYPASS
        )

        result1 = await crawler.arun("https://example.com/login", config=login_config)

        # Step 2: Navigate to protected area (reuses the same browser page)
        nav_config = CrawlerRunConfig(
            session_id="user_session",   # Same session = same browser page
            js_only=True,                # No page reload, just JS navigation
            js_code="window.location.href = '/dashboard/data';",
            wait_for="css:.data-table"
        )

        result2 = await crawler.arun("https://example.com/dashboard/data", config=nav_config)

        # Step 3: Extract data from multiple pages
        for page in range(1, 6):
            page_config = CrawlerRunConfig(
                session_id="user_session",
                js_only=True,
                js_code=f"document.querySelector('.page-{page}').click();",
                wait_for=f"js:() => document.querySelector('.page-{page}').classList.contains('active')"
            )

            result = await crawler.arun(f"https://example.com/data/page/{page}", config=page_config)
            print(f"Page {page} data extracted: {len(result.extracted_content or '')}")

        # Important: kill the session when done
        await crawler.kill_session("user_session")

# Session with shared data between steps
async def session_with_shared_data():
    shared_context = {"user_id": "12345", "preferences": {"theme": "dark"}}

    config = CrawlerRunConfig(
        session_id="persistent_session",
        shared_data=shared_context,   # Available across all session calls
        js_code="console.log('User ID:', window.sharedData.user_id);"
    )
```

#### Identity-Based Crawling Parameters

```python
from crawl4ai import CrawlerRunConfig, GeolocationConfig

# Locale and timezone simulation
config = CrawlerRunConfig(
    locale="en-US",                    # Browser language preference
    timezone_id="America/New_York",    # Timezone setting
    user_agent_mode="random",
    user_agent_generator_config={
        "platform": "windows",
        "locale": "en-US"
    }
)

# Geolocation simulation
geo_config = GeolocationConfig(
    latitude=40.7128,    # New York coordinates
    longitude=-74.0060,
    accuracy=100.0
)

config = CrawlerRunConfig(
    geolocation=geo_config,
    locale="en-US",
    timezone_id="America/New_York"
)

# Complete identity simulation
identity_config = CrawlerRunConfig(
    # Location identity
    locale="fr-FR",
    timezone_id="Europe/Paris",
    geolocation=GeolocationConfig(latitude=48.8566, longitude=2.3522),

    # Browser identity
    user_agent_mode="random",
    user_agent_generator_config={
        "platform": "windows",
        "locale": "fr-FR",
        "browser": "chrome"
    },

    # Behavioral identity
    simulate_user=True,
    override_navigator=True,
    mean_delay=2.0,
    max_range=1.5
)
```

#### Simplified Import Pattern

```python
# Almost everything comes from the crawl4ai main package
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    LLMConfig,
    CacheMode,
    ProxyConfig,
    GeolocationConfig
)

# Specialized strategies (still from crawl4ai)
from crawl4ai import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
    DefaultMarkdownGenerator,
    PruningContentFilter,
    RegexChunking
)

# Complete example with simplified imports
async def example_crawl():
    browser_config = BrowserConfig(headless=True)

    run_config = CrawlerRunConfig(
        user_agent_mode="random",
        proxy_config=ProxyConfig.from_string("192.168.1.1:8080:user:pass"),
        css_selector="main.content",
        target_elements=[".article", ".post"],
        wait_for="js:() => document.querySelector('.loaded')",
        session_id="my_session",
        simulate_user=True
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun("https://example.com", config=run_config)
        return result
```

## Advanced Features

Comprehensive guide to advanced crawling capabilities including file handling, authentication, dynamic content, monitoring, and session management.

### File Download Handling

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
import os

# Enable downloads with a custom path
downloads_path = os.path.join(os.getcwd(), "my_downloads")
os.makedirs(downloads_path, exist_ok=True)

browser_config = BrowserConfig(
    accept_downloads=True,
    downloads_path=downloads_path
)

# Trigger downloads with JavaScript
async def download_files():
    async with AsyncWebCrawler(config=browser_config) as crawler:
        config = CrawlerRunConfig(
            js_code="""
                // Click download links
                const downloadLinks = document.querySelectorAll('a[href$=".pdf"]');
                for (const link of downloadLinks) {
                    link.click();
                    await new Promise(r => setTimeout(r, 2000));   // Delay between downloads
                }
            """,
            wait_for=5   # Wait for downloads to start
        )

        result = await crawler.arun("https://example.com/downloads", config=config)

        if result.downloaded_files:
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path} ({os.path.getsize(file_path)} bytes)")
```

### Hooks & Authentication

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from playwright.async_api import Page, BrowserContext

async def advanced_crawler_with_hooks():
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config)

    # Hook functions for different stages
    async def on_browser_created(browser, **kwargs):
        print("[HOOK] Browser created successfully")
        return browser

    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
        print("[HOOK] Setting up page & context")

        # Block images for faster crawling
        async def route_filter(route):
            if route.request.resource_type == "image":
                await route.abort()
            else:
                await route.continue_()

        await context.route("**", route_filter)

        # Simulate login if needed
        # await page.goto("https://example.com/login")
        # await page.fill("input[name='username']", "testuser")
        # await page.fill("input[name='password']", "password123")
        # await page.click("button[type='submit']")

        await page.set_viewport_size({"width": 1080, "height": 600})
        return page

    async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
        print(f"[HOOK] About to navigate to: {url}")
        await page.set_extra_http_headers({"Custom-Header": "my-value"})
        return page

    async def after_goto(page: Page, context: BrowserContext, url: str, response, **kwargs):
        print(f"[HOOK] Successfully loaded: {url}")
        try:
            await page.wait_for_selector('.content', timeout=1000)
            print("[HOOK] Content found!")
        except Exception:
            print("[HOOK] Content not found, continuing")
        return page

    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
        print("[HOOK] Final actions before HTML retrieval")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        return page

    # Attach hooks
    crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
    crawler.crawler_strategy.set_hook("before_goto", before_goto)
    crawler.crawler_strategy.set_hook("after_goto", after_goto)
    crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)

    await crawler.start()

    config = CrawlerRunConfig()
    result = await crawler.arun("https://example.com", config=config)

    if result.success:
        print(f"Crawled successfully: {len(result.html)} chars")

    await crawler.close()
```

### Lazy Loading & Dynamic Content

```python
# Handle lazy-loaded images and infinite scroll
async def handle_lazy_loading():
    config = CrawlerRunConfig(
        # Wait for images to fully load
        wait_for_images=True,

        # Automatically scroll the entire page to trigger lazy loading
        scan_full_page=True,
        scroll_delay=0.5,   # Delay between scroll steps

        # JavaScript for custom lazy loading
        js_code="""
            // Scroll and wait for content to load
            window.scrollTo(0, document.body.scrollHeight);

            // Click "Load More" if available
            const loadMoreBtn = document.querySelector('.load-more');
            if (loadMoreBtn) {
                loadMoreBtn.click();
            }
        """,

        # Wait for specific content to appear
        wait_for="css:.lazy-content:nth-child(20)",   # Wait for 20 items

        # Exclude external images to focus on main content
        exclude_external_images=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/gallery", config=config)

        if result.success:
            images = result.media.get("images", [])
            print(f"Loaded {len(images)} images after lazy loading")
            for img in images[:3]:
                print(f"- {img.get('src')} (score: {img.get('score', 'N/A')})")
```

### Network & Console Monitoring

```python
# Capture all network requests and console messages for debugging
async def monitor_network_and_console():
    config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

        if result.success:
            # Analyze network requests
            if result.network_requests:
                requests = [r for r in result.network_requests if r.get("event_type") == "request"]
                responses = [r for r in result.network_requests if r.get("event_type") == "response"]
                failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"]

                print(f"Network activity: {len(requests)} requests, {len(responses)} responses, {len(failures)} failures")

                # Find API calls
                api_calls = [r for r in requests if "api" in r.get("url", "")]
                print(f"API calls detected: {len(api_calls)}")

                # Show failed requests
                for failure in failures[:3]:
                    print(f"Failed: {failure.get('url')} - {failure.get('failure_text')}")

            # Analyze console messages
            if result.console_messages:
                message_types = {}
                for msg in result.console_messages:
                    msg_type = msg.get("type", "unknown")
                    message_types[msg_type] = message_types.get(msg_type, 0) + 1

                print(f"Console messages: {message_types}")

                # Show errors
                errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
                for error in errors[:2]:
                    print(f"JS Error: {error.get('text', '')[:100]}")
```

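Both captures are plain lists of dictionaries, so they can be dumped straight to JSON for offline analysis. A small sketch; `save_captured_traffic` and the file names are arbitrary choices, not library conventions.

```python
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def save_captured_traffic(url: str):
    config = CrawlerRunConfig(capture_network_requests=True, capture_console_messages=True)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, config=config)

    if result.success:
        # default=str guards against any non-serializable values in the captured events
        with open("network_log.json", "w", encoding="utf-8") as f:
            json.dump(result.network_requests or [], f, indent=2, default=str)
        with open("console_log.json", "w", encoding="utf-8") as f:
            json.dump(result.console_messages or [], f, indent=2, default=str)
        print("Traffic dumped to network_log.json / console_log.json")
```
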
### Session Management for Multi-Step Workflows

```python
# Maintain state across multiple requests for complex workflows
async def multi_step_session_workflow():
    session_id = "workflow_session"

    async with AsyncWebCrawler() as crawler:
        # Step 1: Initial page load
        config1 = CrawlerRunConfig(
            session_id=session_id,
            wait_for="css:.content-loaded"
        )

        result1 = await crawler.arun("https://example.com/step1", config=config1)
        print("Step 1 completed")

        # Step 2: Navigate and interact (same browser tab)
        config2 = CrawlerRunConfig(
            session_id=session_id,
            js_only=True,   # Don't reload the page, just run JS
            js_code="""
                document.querySelector('#next-button').click();
            """,
            wait_for="css:.step2-content"
        )

        result2 = await crawler.arun("https://example.com/step2", config=config2)
        print("Step 2 completed")

        # Step 3: Form submission
        config3 = CrawlerRunConfig(
            session_id=session_id,
            js_only=True,
            js_code="""
                document.querySelector('#form-field').value = 'test data';
                document.querySelector('#submit-btn').click();
            """,
            wait_for="css:.results"
        )

        result3 = await crawler.arun("https://example.com/submit", config=config3)
        print("Step 3 completed")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

# Advanced GitHub commits pagination example
async def github_commits_pagination():
    session_id = "github_session"
    all_commits = []

    async with AsyncWebCrawler() as crawler:
        for page in range(3):
            if page == 0:
                # Initial load
                config = CrawlerRunConfig(
                    session_id=session_id,
                    wait_for="js:() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"
                )
            else:
                # Navigate to the next page
                config = CrawlerRunConfig(
                    session_id=session_id,
                    js_only=True,
                    js_code='document.querySelector(\'a[data-testid="pagination-next-button"]\').click();',
                    wait_for="js:() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"
                )

            result = await crawler.arun(
                "https://github.com/microsoft/TypeScript/commits/main",
                config=config
            )

            if result.success:
                # Rough count: occurrences of the commit list item class in the cleaned HTML
                commit_count = result.cleaned_html.count('Box-sc-g0xbh4-0')
                print(f"Page {page + 1}: Found {commit_count} commits")

        await crawler.crawler_strategy.kill_session(session_id)
```

### SSL Certificate Analysis

```python
# Fetch and analyze SSL certificates
async def analyze_ssl_certificates():
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

        if result.success and result.ssl_certificate:
            cert = result.ssl_certificate

            # Basic certificate info
            print(f"Issuer: {cert.issuer.get('CN', 'Unknown')}")
            print(f"Subject: {cert.subject.get('CN', 'Unknown')}")
            print(f"Valid from: {cert.valid_from}")
            print(f"Valid until: {cert.valid_until}")
            print(f"Fingerprint: {cert.fingerprint}")

            # Export the certificate in different formats
            import os
            os.makedirs("certificates", exist_ok=True)

            cert.to_json("certificates/cert.json")
            cert.to_pem("certificates/cert.pem")
            cert.to_der("certificates/cert.der")

            print("Certificate exported in multiple formats")
```

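A common follow-up is an expiry check. The helper below is only a sketch: the exact string format of `cert.valid_until` is not specified here, so it tries a few common notations and returns `None` when it cannot parse the value.

```python
from datetime import datetime, timezone
from typing import Optional

def days_until_expiry(valid_until: str) -> Optional[int]:
    # Try a few common notations; adjust once you know the actual format returned
    for fmt in ("%Y-%m-%d %H:%M:%S", "%b %d %H:%M:%S %Y GMT", "%Y-%m-%dT%H:%M:%S"):
        try:
            expires = datetime.strptime(valid_until, fmt).replace(tzinfo=timezone.utc)
            return (expires - datetime.now(timezone.utc)).days
        except ValueError:
            continue
    return None

# Usage inside analyze_ssl_certificates(), after `cert` is available:
# days = days_until_expiry(cert.valid_until)
# if days is not None and days < 30:
#     print(f"Warning: certificate expires in {days} days")
```
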
### Advanced Page Interaction

```python
# Complex page interactions with dynamic content
async def advanced_page_interaction():
    async with AsyncWebCrawler() as crawler:
        # Multi-step interaction with waiting
        config = CrawlerRunConfig(
            js_code=[
                # Step 1: Scroll to load content
                "window.scrollTo(0, document.body.scrollHeight);",

                # Step 2: Wait and click load more
                """
                (async () => {
                    await new Promise(resolve => setTimeout(resolve, 2000));
                    const loadMore = document.querySelector('.load-more');
                    if (loadMore) loadMore.click();
                })();
                """
            ],

            # Wait for new content to appear
            wait_for="js:() => document.querySelectorAll('.item').length > 20",

            # Additional timing controls
            page_timeout=60000,             # 60 second timeout
            delay_before_return_html=2.0,   # Wait before final capture

            # Handle overlays automatically
            remove_overlay_elements=True,
            magic=True,                     # Auto-handle common popup patterns

            # Simulate human behavior
            simulate_user=True,
            override_navigator=True
        )

        result = await crawler.arun("https://example.com/dynamic", config=config)

        if result.success:
            print(f"Interactive crawl completed: {len(result.cleaned_html)} chars")

# Form interaction example
async def form_interaction_example():
    config = CrawlerRunConfig(
        js_code="""
            // Fill the search form
            document.querySelector('#search-input').value = 'machine learning';
            document.querySelector('#category-select').value = 'technology';
            document.querySelector('#search-form').submit();
        """,
        wait_for="css:.search-results",
        session_id="search_session"
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/search", config=config)
        print("Search completed, results loaded")
```

### Local File & Raw HTML Processing

```python
# Handle different input types: URLs, local files, raw HTML
async def handle_different_inputs():
    async with AsyncWebCrawler() as crawler:
        # 1. Regular web URL
        result1 = await crawler.arun("https://example.com")

        # 2. Local HTML file
        local_file_path = "/path/to/file.html"
        result2 = await crawler.arun(f"file://{local_file_path}")

        # 3. Raw HTML content
        raw_html = "<html><body><h1>Test Content</h1><p>Sample text</p></body></html>"
        result3 = await crawler.arun(f"raw:{raw_html}")

        # All return the same CrawlResult structure
        for i, result in enumerate([result1, result2, result3], 1):
            if result.success:
                print(f"Input {i}: {len(result.markdown)} chars of markdown")

# Save and re-process HTML example
async def save_and_reprocess():
    async with AsyncWebCrawler() as crawler:
        # Original crawl
        result = await crawler.arun("https://example.com")

        if result.success:
            # Save the HTML to a file
            with open("saved_page.html", "w", encoding="utf-8") as f:
                f.write(result.html)

            # Re-process from the file
            file_result = await crawler.arun("file://./saved_page.html")

            # Process as raw HTML
            raw_result = await crawler.arun(f"raw:{result.html}")

            # Verify consistency
            assert len(result.markdown) == len(file_result.markdown) == len(raw_result.markdown)
            print("✅ All processing methods produced identical results")
```

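The `file://` prefix also makes it easy to sweep a whole directory of saved pages. A sketch, where `process_local_snapshots`, the `./snapshots` folder, and the `.md` output convention are just illustrative choices:

```python
from pathlib import Path
from crawl4ai import AsyncWebCrawler

async def process_local_snapshots(folder: str = "./snapshots"):
    html_files = sorted(Path(folder).glob("*.html"))

    async with AsyncWebCrawler() as crawler:
        for path in html_files:
            result = await crawler.arun(f"file://{path.resolve()}")
            if result.success:
                # Write the generated markdown next to the source HTML
                path.with_suffix(".md").write_text(str(result.markdown), encoding="utf-8")
                print(f"{path.name}: {len(result.markdown)} chars of markdown")
```
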
### Advanced Link & Media Handling

```python
# Comprehensive link and media extraction with filtering
async def advanced_link_media_handling():
    config = CrawlerRunConfig(
        # Link filtering
        exclude_external_links=False,   # Keep external links for analysis
        exclude_social_media_links=True,
        exclude_domains=["ads.com", "tracker.io", "spammy.net"],

        # Media handling
        exclude_external_images=True,
        image_score_threshold=5,        # Only high-quality images
        table_score_threshold=7,        # Only well-structured tables
        wait_for_images=True,

        # Capture additional formats
        screenshot=True,
        pdf=True,
        capture_mhtml=True              # Full page archive
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

        if result.success:
            # Analyze links
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Links: {len(internal_links)} internal, {len(external_links)} external")

            # Analyze media
            images = result.media.get("images", [])
            tables = result.media.get("tables", [])
            print(f"Media: {len(images)} images, {len(tables)} tables")

            # High-quality images only
            quality_images = [img for img in images if img.get("score", 0) >= 5]
            print(f"High-quality images: {len(quality_images)}")

            # Table analysis
            for i, table in enumerate(tables[:2]):
                print(f"Table {i+1}: {len(table.get('headers', []))} columns, {len(table.get('rows', []))} rows")

            # Save captured files
            if result.screenshot:
                import base64
                with open("page_screenshot.png", "wb") as f:
                    f.write(base64.b64decode(result.screenshot))

            if result.pdf:
                with open("page.pdf", "wb") as f:
                    f.write(result.pdf)

            if result.mhtml:
                with open("page_archive.mhtml", "w", encoding="utf-8") as f:
                    f.write(result.mhtml)

            print("Additional formats saved: screenshot, PDF, MHTML archive")
```

### Performance & Resource Management

```python
# Optimize performance for large-scale crawling
async def performance_optimized_crawling():
    # Lightweight browser config
    browser_config = BrowserConfig(
        headless=True,
        text_mode=True,    # Disable images for speed
        light_mode=True,   # Reduce background features
        extra_args=["--disable-extensions", "--no-sandbox"]
    )

    # Efficient crawl config
    config = CrawlerRunConfig(
        # Content filtering for speed
        excluded_tags=["script", "style", "nav", "footer"],
        exclude_external_links=True,
        exclude_all_images=True,          # Remove all images for max speed
        word_count_threshold=50,

        # Timing optimizations
        page_timeout=30000,               # Faster timeout
        delay_before_return_html=0.1,

        # Resource monitoring
        capture_network_requests=False,   # Disable unless needed
        capture_console_messages=False,

        # Cache for repeated URLs
        cache_mode=CacheMode.ENABLED
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        urls = ["https://example.com/page1", "https://example.com/page2", "https://example.com/page3"]

        # Efficient batch processing
        batch_config = config.clone(
            stream=True,         # Stream results as they complete
            semaphore_count=3    # Control concurrency
        )

        async for result in await crawler.arun_many(urls, config=batch_config):
            if result.success:
                print(f"✅ {result.url}: {len(result.markdown)} chars")
            else:
                print(f"❌ {result.url}: {result.error_message}")
```
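To confirm that these settings actually improve throughput, it helps to time a batch. The sketch below wraps `arun_many` with plain wall-clock measurement; `timed_batch` is a hypothetical helper and everything beyond the calls already shown above is standard-library timing.

```python
import time
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def timed_batch(urls, browser_config: BrowserConfig, run_config: CrawlerRunConfig):
    start = time.perf_counter()
    ok = failed = 0

    async with AsyncWebCrawler(config=browser_config) as crawler:
        async for result in await crawler.arun_many(urls, config=run_config.clone(stream=True)):
            if result.success:
                ok += 1
            else:
                failed += 1

    elapsed = time.perf_counter() - start
    print(f"{ok} ok, {failed} failed in {elapsed:.1f}s "
          f"({len(urls) / elapsed:.2f} pages/s)")
```
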
**📖 Learn more:** [Complete Parameter Reference](https://docs.crawl4ai.com/api/parameters/), [Content Filtering](https://docs.crawl4ai.com/core/markdown-generation/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Capture](https://docs.crawl4ai.com/advanced/network-console-capture/)

**📖 Learn more:** [Hooks & Authentication](https://docs.crawl4ai.com/advanced/hooks-auth/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Monitoring](https://docs.crawl4ai.com/advanced/network-console-capture/), [Page Interaction](https://docs.crawl4ai.com/core/page-interaction/), [File Downloads](https://docs.crawl4ai.com/advanced/file-downloading/)