Files
crawl4ai/docs/details/features.md
2024-10-27 19:24:46 +08:00

4.0 KiB

1. Basic Web Crawling

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.markdown)  # Get clean markdown content
    print(result.html)      # Get raw HTML
    print(result.cleaned_html)  # Get cleaned HTML

2. Browser Control Options

  • Multiple Browser Support
# Choose between different browser engines
crawler = AsyncWebCrawler(browser_type="firefox")  # or "chromium", "webkit"
crawler = AsyncWebCrawler(headless=False)  # For visible browser
  • Proxy Configuration
crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080")
# Or with authentication
crawler = AsyncWebCrawler(proxy_config={
    "server": "http://proxy.example.com:8080",
    "username": "user",
    "password": "pass"
})

3. Content Selection & Filtering

  • CSS Selector Support
result = await crawler.arun(
    url="https://example.com",
    css_selector=".main-content"  # Extract specific content
)
  • Content Filtering Options
result = await crawler.arun(
    url="https://example.com",
    word_count_threshold=10,  # Minimum words per block
    excluded_tags=['form', 'header'],  # Tags to exclude
    exclude_external_links=True,  # Remove external links
    exclude_social_media_links=True,  # Remove social media links
    exclude_external_images=True  # Remove external images
)

4. Dynamic Content Handling

  • JavaScript Execution
result = await crawler.arun(
    url="https://example.com",
    js_code="window.scrollTo(0, document.body.scrollHeight)"  # Execute custom JS
)
  • Wait Conditions
result = await crawler.arun(
    url="https://example.com",
    wait_for="css:.my-element",  # Wait for element
    wait_for="js:() => document.readyState === 'complete'"  # Wait for condition
)

5. Anti-Bot Protection Handling

result = await crawler.arun(
    url="https://example.com",
    simulate_user=True,  # Simulate human behavior
    override_navigator=True,  # Mask automation signals
    magic=True  # Enable all anti-detection features
)

6. Session Management

session_id = "my_session"
result1 = await crawler.arun(url="https://example.com/page1", session_id=session_id)
result2 = await crawler.arun(url="https://example.com/page2", session_id=session_id)
await crawler.crawler_strategy.kill_session(session_id)

7. Media Handling

  • Screenshot Capture
result = await crawler.arun(
    url="https://example.com",
    screenshot=True
)
base64_screenshot = result.screenshot
  • Media Extraction
result = await crawler.arun(url="https://example.com")
print(result.media['images'])  # List of images
print(result.media['videos'])  # List of videos
print(result.media['audios'])  # List of audio files

8. Structured Data Extraction

  • CSS-based Extraction
schema = {
    "name": "News Articles",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h1", "type": "text"},
        {"name": "date", "selector": ".date", "type": "text"}
    ]
}
extraction_strategy = JsonCssExtractionStrategy(schema)
result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=extraction_strategy
)
structured_data = json.loads(result.extracted_content)
  • LLM-based Extraction (Multiple Providers)
class NewsArticle(BaseModel):
    title: str
    summary: str

strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",  # or "huggingface/...", "ollama/..."
    api_token="your-token",
    schema=NewsArticle.schema(),
    instruction="Extract news article details..."
)
result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=strategy
)

9. Content Cleaning & Processing

result = await crawler.arun(
    url="https://example.com",
    remove_overlay_elements=True,  # Remove popups/modals
    process_iframes=True,  # Process iframe content
)
print(result.fit_markdown)  # Get most relevant content
print(result.fit_html)     # Get cleaned HTML