9.1 KiB
9.1 KiB
AsyncWebCrawler Constructor Parameters
AsyncWebCrawler(
# Core Browser Settings
browser_type: str = "chromium", # Options: "chromium", "firefox", "webkit"
headless: bool = True, # Whether to run browser in headless mode
verbose: bool = False, # Enable verbose logging
# Cache Settings
always_by_pass_cache: bool = False, # Always bypass cache regardless of run settings
base_directory: str = str(Path.home()), # Base directory for cache storage
# Network Settings
proxy: str = None, # Simple proxy URL (e.g., "http://proxy.example.com:8080")
proxy_config: Dict = None, # Advanced proxy settings with auth: {"server": str, "username": str, "password": str}
# Browser Behavior
sleep_on_close: bool = False, # Wait before closing browser
# Other Settings passed to AsyncPlaywrightCrawlerStrategy
user_agent: str = None, # Custom user agent string
headers: Dict[str, str] = {}, # Custom HTTP headers
js_code: Union[str, List[str]] = None, # Default JavaScript to execute
)
arun() Method Parameters
arun(
# Core Parameters
url: str, # Required: URL to crawl
# Content Selection
css_selector: str = None, # CSS selector to extract specific content
word_count_threshold: int = MIN_WORD_THRESHOLD, # Minimum words for content blocks
# Cache Control
bypass_cache: bool = False, # Bypass cache for this request
# Session Management
session_id: str = None, # Session identifier for persistent browsing
# Screenshot Options
screenshot: bool = False, # Take page screenshot
screenshot_wait_for: float = None, # Wait time before screenshot
# Content Processing
process_iframes: bool = False, # Process iframe content
remove_overlay_elements: bool = False, # Remove popups/modals
# Anti-Bot/Detection
simulate_user: bool = False, # Simulate human-like behavior
override_navigator: bool = False, # Override navigator properties
magic: bool = False, # Enable all anti-detection features
# Content Filtering
excluded_tags: List[str] = None, # HTML tags to exclude
exclude_external_links: bool = False, # Remove external links
exclude_social_media_links: bool = False, # Remove social media links
exclude_external_images: bool = False, # Remove external images
exclude_social_media_domains: List[str] = None, # Additional social media domains to exclude
remove_forms: bool = False, # Remove all form elements
# JavaScript Handling
js_code: Union[str, List[str]] = None, # JavaScript to execute
js_only: bool = False, # Only execute JavaScript without reloading page
wait_for: str = None, # Wait condition (CSS selector or JS function)
# Page Loading
page_timeout: int = 60000, # Page load timeout in milliseconds
delay_before_return_html: float = None, # Wait before returning HTML
# Debug Options
log_console: bool = False, # Log browser console messages
# Content Format Control
only_text: bool = False, # Extract only text content
keep_data_attributes: bool = False, # Keep data-* attributes in HTML
# Markdown Options
include_links_on_markdown: bool = False, # Include links in markdown output
html2text: Dict = {}, # HTML to text conversion options
# Extraction Strategy
extraction_strategy: ExtractionStrategy = None, # Strategy for structured data extraction
# Advanced Browser Control
user_agent: str = None, # Override user agent for this request
)
Extraction Strategy Parameters
# JsonCssExtractionStrategy
{
"name": str, # Name of extraction schema
"baseSelector": str, # Base CSS selector
"fields": [
{
"name": str, # Field name
"selector": str, # CSS selector
"type": str, # Data type ("text", etc.)
"transform": str = None # Optional transformation
}
]
}
# LLMExtractionStrategy
{
"provider": str, # LLM provider (e.g., "openai/gpt-4", "huggingface/...", "ollama/...")
"api_token": str, # API token
"schema": dict, # Pydantic model schema
"extraction_type": str, # Type of extraction ("schema", etc.)
"instruction": str, # Extraction instruction
"extra_args": dict = None, # Additional provider-specific arguments
"extra_headers": dict = None # Additional HTTP headers
}
HTML to Text Conversion Options (html2text parameter)
{
"escape_dot": bool = True, # Escape dots in text
# Other html2text library options
}
CrawlResult Fields
class CrawlResult(BaseModel):
    """Structured outcome of a single crawl.

    Groups the crawl status, the page content in several forms (raw HTML,
    cleaned HTML, markdown), the media and links found on the page, any
    extraction-strategy output, and diagnostic details.

    Only ``url``, ``success`` and ``html`` are required. Every ``Optional``
    field defaults to ``None`` explicitly: under pydantic v2 an
    ``Optional[...]`` annotation without a default is still a *required*
    field, which would force callers to supply every content field even
    for a failed crawl.
    """

    # Basic Information
    url: str  # The crawled URL
    # Example: "https://example.com"
    success: bool  # Whether the crawl was successful
    # Example: True/False
    status_code: Optional[int] = None  # HTTP status code
    # Example: 200, 404, 500

    # Content Fields
    html: str  # Raw HTML content
    # Example: "<html><body>...</body></html>"
    cleaned_html: Optional[str] = None  # HTML after cleaning and processing
    # Example: "<article><p>Clean content...</p></article>"
    fit_html: Optional[str] = None  # Most relevant HTML content after content cleaning strategy
    # Example: "<div><p>Most relevant content...</p></div>"
    markdown: Optional[str] = None  # HTML converted to markdown
    # Example: "# Title\n\nContent paragraph..."
    fit_markdown: Optional[str] = None  # Most relevant content in markdown
    # Example: "# Main Article\n\nKey content..."

    # Media Content
    # NOTE: pydantic copies field defaults for each instance, so the `{}`
    # defaults below are not shared between results.
    media: Dict[str, List[Dict]] = {}  # Extracted media information
    # Example: {
    #     "images": [
    #         {
    #             "src": "https://example.com/image.jpg",
    #             "alt": "Image description",
    #             "desc": "Contextual description",
    #             "score": 5,  # Relevance score
    #             "type": "image"
    #         }
    #     ],
    #     "videos": [
    #         {
    #             "src": "https://example.com/video.mp4",
    #             "alt": "Video title",
    #             "type": "video",
    #             "description": "Video context"
    #         }
    #     ],
    #     "audios": [
    #         {
    #             "src": "https://example.com/audio.mp3",
    #             "alt": "Audio title",
    #             "type": "audio",
    #             "description": "Audio context"
    #         }
    #     ]
    # }

    # Link Information
    links: Dict[str, List[Dict]] = {}  # Extracted links
    # Example: {
    #     "internal": [
    #         {
    #             "href": "https://example.com/page",
    #             "text": "Link text",
    #             "title": "Link title"
    #         }
    #     ],
    #     "external": [
    #         {
    #             "href": "https://external.com",
    #             "text": "External link text",
    #             "title": "External link title"
    #         }
    #     ]
    # }

    # Extraction Results
    extracted_content: Optional[str] = None  # Content from extraction strategy
    # Example for JsonCssExtractionStrategy:
    #     '[{"title": "Article 1", "date": "2024-03-20"}, ...]'
    # Example for LLMExtractionStrategy:
    #     '{"entities": [...], "relationships": [...]}'

    # Additional Information
    metadata: Optional[dict] = None  # Page metadata
    # Example: {
    #     "title": "Page Title",
    #     "description": "Meta description",
    #     "keywords": ["keyword1", "keyword2"],
    #     "author": "Author Name",
    #     "published_date": "2024-03-20"
    # }
    screenshot: Optional[str] = None  # Base64 encoded screenshot
    # Example: "iVBORw0KGgoAAAANSUhEUgAA..."
    error_message: Optional[str] = None  # Error message if crawl failed
    # Example: "Failed to load page: timeout"
    session_id: Optional[str] = None  # Session identifier
    # Example: "session_123456"
    response_headers: Optional[dict] = None  # HTTP response headers
    # Example: {
    #     "content-type": "text/html",
    #     "server": "nginx/1.18.0",
    #     "date": "Wed, 20 Mar 2024 12:00:00 GMT"
    # }
Common Usage Patterns:
- Basic Content Extraction:
result = await crawler.arun(url="https://example.com")
print(result.markdown) # Clean, readable content
print(result.cleaned_html) # Cleaned HTML
- Media Analysis:
result = await crawler.arun(url="https://example.com")
for image in result.media["images"]:
    if image["score"] > 3:  # High-relevance images
        print(f"High-quality image: {image['src']}")
- Link Analysis:
result = await crawler.arun(url="https://example.com")
internal_links = [link["href"] for link in result.links["internal"]]
external_links = [link["href"] for link in result.links["external"]]
- Structured Data Extraction:
result = await crawler.arun(
url="https://example.com",
extraction_strategy=my_strategy
)
structured_data = json.loads(result.extracted_content)
- Error Handling:
result = await crawler.arun(url="https://example.com")
if not result.success:
    print(f"Crawl failed: {result.error_message}")
    print(f"Status code: {result.status_code}")