Files
crawl4ai/docs/details/features_details.md
2024-10-27 19:24:46 +08:00

14 KiB

This document expands the feature outline with detailed descriptions and usage examples for each section.

1. Basic Web Crawling

Basic web crawling provides the foundation for extracting content from websites. The library supports both simple single-page crawling and recursive website crawling.

# Simple page crawling
# Crawl a single URL; the result exposes the page in three formats.
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.html)        # Raw HTML
    print(result.markdown)    # Cleaned markdown
    print(result.cleaned_html)  # Cleaned HTML

# Recursive website crawling
class SimpleWebsiteScraper:
    """Minimal wrapper that crawls a site recursively up to a depth limit."""

    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler

    async def scrape(self, start_url: str, max_depth: int):
        """Crawl *start_url* and linked pages down to *max_depth* levels.

        NOTE(review): delegates to ``self.scrape_recursive``, which is not
        defined in this excerpt — presumably implemented elsewhere; confirm.
        """
        results = await self.scrape_recursive(start_url, max_depth)
        return results

# Usage
async with AsyncWebCrawler() as crawler:
    scraper = SimpleWebsiteScraper(crawler)
    results = await scraper.scrape("https://example.com", depth=2)

2. Browser Control Options

The library provides extensive control over browser behavior, allowing customization of browser type, headless mode, and proxy settings.

# Browser Type Selection
# Choose the browser engine, toggle headless mode, and enable logging.
async with AsyncWebCrawler(
    browser_type="firefox",  # Options: "chromium", "firefox", "webkit"
    headless=False,         # For visible browser
    verbose=True           # Enable logging
) as crawler:
    result = await crawler.arun(url="https://example.com")

# Proxy Configuration
# Route traffic through an authenticated proxy and send custom headers.
async with AsyncWebCrawler(
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass"
    },
    headers={
        "User-Agent": "Custom User Agent",
        "Accept-Language": "en-US,en;q=0.9"
    }
) as crawler:
    result = await crawler.arun(url="https://example.com")

3. Content Selection & Filtering

The library offers multiple ways to select and filter content, from CSS selectors to word count thresholds.

# CSS Selector and Content Filtering
# Narrow extraction to a specific element and drop low-value content.
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        css_selector="article.main-content",  # Extract specific content
        word_count_threshold=10,              # Minimum words per block
        excluded_tags=['form', 'header'],     # Tags to exclude
        exclude_external_links=True,          # Remove external links
        exclude_social_media_links=True,      # Remove social media links
        exclude_domains=["pinterest.com", "facebook.com"]  # Exclude specific domains
    )

# Custom HTML to Text Options
# NOTE(review): these keys look like html2text converter options —
# confirm the exact pass-through behavior against the library docs.
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        html2text={
            "escape_dot": False,
            "links_each_paragraph": True,
            "protect_links": True
        }
    )

4. Dynamic Content Handling

The library provides sophisticated handling of dynamic content with JavaScript execution and wait conditions.

# JavaScript Execution and Wait Conditions
# Run scripts in the page, then wait for a selector before capturing HTML.
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        js_code=[
            "window.scrollTo(0, document.body.scrollHeight);",
            "document.querySelector('.load-more').click();"
        ],
        wait_for="css:.dynamic-content",  # Wait for element
        delay_before_return_html=2.0      # Wait after JS execution
    )

# Smart Wait Conditions
# wait_for also accepts a JS predicate string; crawling proceeds once
# it returns true (or page_timeout elapses).
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        wait_for="""() => {
            return document.querySelectorAll('.item').length > 10;
        }""",
        page_timeout=60000  # 60 seconds timeout
    )

5. Link Analysis

The library provides comprehensive link analysis capabilities, distinguishing between internal and external links, with options for filtering and processing.

# Basic Link Analysis
# result.links is split into 'internal' and 'external' lists; each entry
# carries at least 'href' and 'text' keys.
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    
    # Access internal and external links
    for internal_link in result.links['internal']:
        print(f"Internal: {internal_link['href']} - {internal_link['text']}")
    
    for external_link in result.links['external']:
        print(f"External: {external_link['href']} - {external_link['text']}")

# Advanced Link Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        exclude_external_links=True,          # Remove all external links
        exclude_social_media_links=True,      # Remove social media links
        exclude_social_media_domains=[                # Custom social media domains
            "facebook.com", "twitter.com", "instagram.com"
        ],
        exclude_domains=["pinterest.com"]     # Specific domains to exclude
    )

6. Anti-Bot Protection Handling

The library includes sophisticated anti-detection mechanisms to handle websites with bot protection.

# Basic Anti-Detection
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        simulate_user=True,        # Simulate human behavior
        override_navigator=True    # Override navigator properties
    )

# Advanced Anti-Detection with Magic Mode
# headless=False opens a visible browser window.
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        magic=True,               # Enable all anti-detection features
        remove_overlay_elements=True,  # Remove popups/modals automatically
        # Custom navigator properties
        # (masks the webdriver flag that automated browsers normally expose)
        js_code="""
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        """
    )

7. Session Management

Session management allows maintaining state across multiple requests and handling cookies.

# Basic Session Management
# Reusing the same session_id carries state across arun() calls.
async with AsyncWebCrawler() as crawler:
    session_id = "my_session"
    
    # Login
    login_result = await crawler.arun(
        url="https://example.com/login",
        session_id=session_id,
        js_code="document.querySelector('form').submit();"
    )
    
    # Use same session for subsequent requests
    protected_result = await crawler.arun(
        url="https://example.com/protected",
        session_id=session_id
    )
    
    # Clean up session (frees the underlying browser context)
    await crawler.crawler_strategy.kill_session(session_id)

# Advanced Session with Custom Cookies
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        session_id="custom_session",
        cookies=[{
            "name": "sessionId",
            "value": "abc123",
            "domain": "example.com"
        }]
    )

8. Screenshot and Media Handling

The library provides comprehensive media handling capabilities, including screenshots and media content extraction.

# Screenshot Capture
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        screenshot=True,
        screenshot_wait_for=2.0  # Wait before taking screenshot
    )
    
    # Save screenshot
    if result.screenshot:
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))

# Media Extraction
# result.media groups entries under 'images', 'videos', and 'audios'.
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    
    # Process images with metadata
    for image in result.media['images']:
        print(f"Image: {image['src']}")
        print(f"Alt text: {image['alt']}")
        print(f"Context: {image['desc']}")
        print(f"Relevance score: {image['score']}")
    
    # Process videos and audio
    for video in result.media['videos']:
        print(f"Video: {video['src']}")
    for audio in result.media['audios']:
        print(f"Audio: {audio['src']}")

9. Structured Data Extraction & Chunking

The library supports multiple strategies for structured data extraction and content chunking.

# LLM-based Extraction
# Target schema for the extracted record, defined as a pydantic model.
class NewsArticle(BaseModel):
    title: str
    content: str
    author: str

# NOTE(review): .schema() is the pydantic v1 API (renamed to
# model_json_schema() in v2) — confirm which version the library pins.
extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4',
    api_token="your-token",
    schema=NewsArticle.schema(),
    instruction="Extract news article details",
    chunk_token_threshold=1000,
    overlap_rate=0.1
)

# CSS-based Extraction
# Declarative schema: one record per ``baseSelector`` match, one value
# per entry in ``fields``.
schema = {
    "name": "Product Listing",
    "baseSelector": ".product-card",
    "fields": [
        {
            "name": "title",
            "selector": "h2",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".price",
            "type": "text",
            "transform": "strip"
        }
    ]
}

css_strategy = JsonCssExtractionStrategy(schema)

# Text Chunking
from crawl4ai.chunking_strategy import OverlappingWindowChunking

# Overlapping sliding windows of size 1000 with overlap 100.
# NOTE(review): the unit (words vs tokens) is not shown here — confirm.
chunking_strategy = OverlappingWindowChunking(
    window_size=1000,
    overlap=100
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=extraction_strategy,
        chunking_strategy=chunking_strategy
    )

10. Content Cleaning & Processing

The library provides extensive content cleaning and processing capabilities, ensuring high-quality output in various formats.

# Basic Content Cleaning
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        remove_overlay_elements=True,  # Remove popups/modals
        process_iframes=True,          # Process iframe content
        word_count_threshold=10        # Minimum words per block
    )
    
    print(result.cleaned_html)    # Clean HTML
    print(result.fit_html)        # Most relevant HTML content
    print(result.fit_markdown)    # Most relevant markdown content

# Advanced Content Processing
# html2text options tune the HTML-to-markdown conversion
# (body_width 0 means no line wrapping, per html2text convention).
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        excluded_tags=['form', 'header', 'footer', 'nav'],
        html2text={
            "escape_dot": False,
            "body_width": 0,
            "protect_links": True,
            "unicode_snob": True,
            "ignore_links": False,
            "ignore_images": False,
            "ignore_emphasis": False,
            "bypass_tables": False,
            "ignore_tables": False
        }
    )

Advanced Usage Patterns

1. Combining Multiple Features

# Combine anti-bot, selection, dynamic-content, filtering, media,
# and cleaning options in a single arun() call.
async with AsyncWebCrawler(
    browser_type="chromium",
    headless=False,
    verbose=True
) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        # Anti-bot measures
        magic=True,
        simulate_user=True,
        
        # Content selection
        css_selector="article.main",
        word_count_threshold=10,
        
        # Dynamic content handling
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.dynamic-content",
        
        # Content filtering
        exclude_external_links=True,
        exclude_social_media_links=True,
        
        # Media handling
        screenshot=True,
        process_iframes=True,
        
        # Content cleaning
        remove_overlay_elements=True
    )

2. Custom Extraction Pipeline

# Define custom schemas and strategies
# Two-pass pipeline: a fast CSS pass for structure, then an LLM pass
# for semantic extraction, chunking large pages before the LLM call.
class Article(BaseModel):
    title: str
    content: str
    date: str

# CSS extraction for initial content
css_schema = {
    "name": "Article Extraction",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h1", "type": "text"},
        {"name": "content", "selector": ".content", "type": "html"},
        {"name": "date", "selector": ".date", "type": "text"}
    ]
}

# LLM processing for semantic analysis
# NOTE(review): Article.schema() is the pydantic v1 API.
llm_strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",
    api_token="your-token",
    schema=Article.schema(),
    instruction="Extract and clean article content"
)

# Chunking strategy for large content
chunking = OverlappingWindowChunking(window_size=1000, overlap=100)

async with AsyncWebCrawler() as crawler:
    # First pass: Extract structure
    css_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=JsonCssExtractionStrategy(css_schema)
    )
    
    # Second pass: Semantic processing
    llm_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=llm_strategy,
        chunking_strategy=chunking
    )

3. Website Crawling with Custom Processing

class CustomWebsiteCrawler:
    """Website crawler that stores a processed summary of each visited page.

    NOTE(review): ``_is_valid_link`` is referenced below but not defined in
    this excerpt — presumably implemented elsewhere; confirm.
    """

    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler
        self.results = {}  # maps url -> dict with content/links/media/metadata

    async def process_page(self, url: str) -> Dict:
        """Fetch *url* and return its markdown content, filtered internal
        links, image URLs, and page metadata as a dict."""
        result = await self.crawler.arun(
            url=url,
            magic=True,
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True
        )
        
        # Process internal links
        internal_links = [
            link['href'] for link in result.links['internal']
            if self._is_valid_link(link['href'])
        ]
        
        # Extract media
        media_urls = [img['src'] for img in result.media['images']]
        
        return {
            'content': result.markdown,
            'links': internal_links,
            'media': media_urls,
            'metadata': result.metadata
        }

    async def crawl_website(self, start_url: str, max_depth: int = 2):
        visited = set()
        queue = [(start_url, 0)]
        
        while queue:
            url, depth = queue.pop(0)
            if depth > max_depth or url in visited:
                continue
                
            visited.add(url)
            self.results[url] = await self.process_page(url)