# Crawl4AI Feature Details
### 1. Basic Web Crawling

Basic web crawling provides the foundation for extracting content from websites. The library supports both simple single-page crawling and recursive website crawling.

```python
from crawl4ai import AsyncWebCrawler

# Simple page crawling
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.html)          # Raw HTML
    print(result.markdown)      # Cleaned markdown
    print(result.cleaned_html)  # Cleaned HTML

# Recursive website crawling
class SimpleWebsiteScraper:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler

    async def scrape(self, start_url: str, max_depth: int) -> dict:
        # Minimal breadth-first traversal: follow internal links up to
        # max_depth, collecting one crawl result per visited URL.
        results, visited = {}, set()
        queue = [(start_url, 0)]
        while queue:
            url, depth = queue.pop(0)
            if depth > max_depth or url in visited:
                continue
            visited.add(url)
            result = await self.crawler.arun(url=url)
            results[url] = result
            for link in result.links.get('internal', []):
                queue.append((link['href'], depth + 1))
        return results

# Usage
async with AsyncWebCrawler() as crawler:
    scraper = SimpleWebsiteScraper(crawler)
    results = await scraper.scrape("https://example.com", max_depth=2)
```

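The snippets above assume an already-running event loop. In a standalone script, wrap the crawl in an `async` entry point and hand it to `asyncio.run` (plain standard-library Python; only the names from the example above are used):

```python
import asyncio

from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```
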
### 2. Browser Control Options

The library provides extensive control over browser behavior, allowing customization of browser type, headless mode, and proxy settings.

```python
# Browser Type Selection
async with AsyncWebCrawler(
    browser_type="firefox",  # Options: "chromium", "firefox", "webkit"
    headless=False,          # For visible browser
    verbose=True             # Enable logging
) as crawler:
    result = await crawler.arun(url="https://example.com")

# Proxy Configuration
async with AsyncWebCrawler(
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass"
    },
    headers={
        "User-Agent": "Custom User Agent",
        "Accept-Language": "en-US,en;q=0.9"
    }
) as crawler:
    result = await crawler.arun(url="https://example.com")
```

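Because `proxy_config` is fixed when the crawler is constructed, a simple way to rotate proxies is to open one crawler per proxy. A minimal sketch, reusing only the options shown above (the proxy URLs are placeholders):

```python
proxies = [
    {"server": "http://proxy1.example.com:8080"},
    {"server": "http://proxy2.example.com:8080"},
]

async def crawl_with_rotation(urls):
    results = []
    for i, url in enumerate(urls):
        # Pick the next proxy round-robin and bind a fresh crawler to it.
        async with AsyncWebCrawler(proxy_config=proxies[i % len(proxies)]) as crawler:
            results.append(await crawler.arun(url=url))
    return results
```
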
### 3. Content Selection & Filtering

The library offers multiple ways to select and filter content, from CSS selectors to word count thresholds.

```python
# CSS Selector and Content Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        css_selector="article.main-content",  # Extract specific content
        word_count_threshold=10,              # Minimum words per block
        excluded_tags=['form', 'header'],     # Tags to exclude
        exclude_external_links=True,          # Remove external links
        exclude_social_media_links=True,      # Remove social media links
        exclude_domains=["pinterest.com", "facebook.com"]  # Exclude specific domains
    )

# Custom HTML to Text Options
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        html2text={
            "escape_dot": False,
            "links_each_paragraph": True,
            "protect_links": True
        }
    )
```

### 4. Dynamic Content Handling

The library provides sophisticated handling of dynamic content with JavaScript execution and wait conditions.

```python
# JavaScript Execution and Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        js_code=[
            "window.scrollTo(0, document.body.scrollHeight);",
            "document.querySelector('.load-more').click();"
        ],
        wait_for="css:.dynamic-content",  # Wait for element
        delay_before_return_html=2.0      # Wait after JS execution
    )

# Smart Wait Conditions
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        wait_for="""() => {
            return document.querySelectorAll('.item').length > 10;
        }""",
        page_timeout=60000  # 60 seconds timeout
    )
```

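For content that loads in stages (e.g., a "load more" button that must be clicked repeatedly), one pattern is to keep the page alive across calls and run extra JavaScript each round. A sketch, assuming `arun` accepts a `session_id` (covered in section 7) and a `js_only` flag that re-runs JS in the live page without re-navigating; treat `js_only` as an assumption to verify against your version of the library:

```python
async with AsyncWebCrawler() as crawler:
    session_id = "load_more_session"
    result = await crawler.arun(
        url="https://example.com",
        session_id=session_id,
        wait_for="css:.item"
    )
    for _ in range(3):
        # Assumption: js_only=True re-executes JS in the existing session page.
        result = await crawler.arun(
            url="https://example.com",
            session_id=session_id,
            js_code="document.querySelector('.load-more').click();",
            js_only=True,
            delay_before_return_html=1.0
        )
    await crawler.crawler_strategy.kill_session(session_id)
```
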
### 5. Advanced Link Analysis

The library provides comprehensive link analysis capabilities, distinguishing between internal and external links, with options for filtering and processing.

```python
# Basic Link Analysis
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Access internal and external links
    for internal_link in result.links['internal']:
        print(f"Internal: {internal_link['href']} - {internal_link['text']}")

    for external_link in result.links['external']:
        print(f"External: {external_link['href']} - {external_link['text']}")

# Advanced Link Filtering
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        exclude_external_links=True,      # Remove all external links
        exclude_social_media_links=True,  # Remove social media links
        exclude_social_media_domains=[    # Custom social media domains
            "facebook.com", "twitter.com", "instagram.com"
        ],
        exclude_domains=["pinterest.com"]  # Specific domains to exclude
    )
```

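Extracted links often repeat the same page under different fragments or relative forms. A small standard-library sketch for normalizing and de-duplicating them (only the `result.links` structure shown above is assumed):

```python
from urllib.parse import urldefrag, urljoin

def unique_internal_urls(result, base_url):
    seen = set()
    for link in result.links['internal']:
        # Resolve relative hrefs against the page URL and drop #fragments.
        url, _ = urldefrag(urljoin(base_url, link['href']))
        seen.add(url)
    return sorted(seen)
```
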
### 6. Anti-Bot Protection Handling

The library includes sophisticated anti-detection mechanisms to handle websites with bot protection.

```python
# Basic Anti-Detection
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        simulate_user=True,      # Simulate human behavior
        override_navigator=True  # Override navigator properties
    )

# Advanced Anti-Detection with Magic Mode
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        magic=True,                    # Enable all anti-detection features
        remove_overlay_elements=True,  # Remove popups/modals automatically
        # Custom navigator properties
        js_code="""
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        """
    )
```

### 7. Session Management

Session management allows maintaining state across multiple requests and handling cookies.

```python
# Basic Session Management
async with AsyncWebCrawler() as crawler:
    session_id = "my_session"

    # Login
    login_result = await crawler.arun(
        url="https://example.com/login",
        session_id=session_id,
        js_code="document.querySelector('form').submit();"
    )

    # Use same session for subsequent requests
    protected_result = await crawler.arun(
        url="https://example.com/protected",
        session_id=session_id
    )

    # Clean up session
    await crawler.crawler_strategy.kill_session(session_id)

# Advanced Session with Custom Cookies
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        session_id="custom_session",
        cookies=[{
            "name": "sessionId",
            "value": "abc123",
            "domain": "example.com"
        }]
    )
```

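If any request in the session raises, the `kill_session` call above is skipped. A small sketch that guarantees cleanup with `try`/`finally`, using only the calls already shown:

```python
async with AsyncWebCrawler() as crawler:
    session_id = "my_session"
    try:
        await crawler.arun(url="https://example.com/login", session_id=session_id)
        result = await crawler.arun(url="https://example.com/protected", session_id=session_id)
    finally:
        # Always release the session, even if a request failed.
        await crawler.crawler_strategy.kill_session(session_id)
```
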
### 8. Screenshot and Media Handling

The library provides comprehensive media handling capabilities, including screenshots and media content extraction.

```python
import base64

# Screenshot Capture
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        screenshot=True,
        screenshot_wait_for=2.0  # Wait before taking screenshot
    )

    # Save screenshot (returned as a base64-encoded string)
    if result.screenshot:
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))

# Media Extraction
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")

    # Process images with metadata
    for image in result.media['images']:
        print(f"Image: {image['src']}")
        print(f"Alt text: {image['alt']}")
        print(f"Context: {image['desc']}")
        print(f"Relevance score: {image['score']}")

    # Process videos and audio
    for video in result.media['videos']:
        print(f"Video: {video['src']}")
    for audio in result.media['audios']:
        print(f"Audio: {audio['src']}")
```

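Since each image carries a relevance score, a common follow-up is to keep only images above a threshold. A minimal sketch over the `result.media` structure shown above (the 0.5 cutoff is an arbitrary example value):

```python
def relevant_images(result, min_score=0.5):
    # Keep only images the crawler scored as relevant to the page content.
    return [
        img['src'] for img in result.media['images']
        if img.get('score', 0) >= min_score
    ]
```
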
### 9. Structured Data Extraction & Chunking

The library supports multiple strategies for structured data extraction and content chunking.

```python
from pydantic import BaseModel

from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

# LLM-based Extraction
class NewsArticle(BaseModel):
    title: str
    content: str
    author: str

extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4',
    api_token="your-token",
    schema=NewsArticle.schema(),
    instruction="Extract news article details",
    chunk_token_threshold=1000,
    overlap_rate=0.1
)

# CSS-based Extraction
schema = {
    "name": "Product Listing",
    "baseSelector": ".product-card",
    "fields": [
        {
            "name": "title",
            "selector": "h2",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".price",
            "type": "text",
            "transform": "strip"
        }
    ]
}

css_strategy = JsonCssExtractionStrategy(schema)

# Text Chunking
chunking_strategy = OverlappingWindowChunking(
    window_size=1000,
    overlap=100
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=extraction_strategy,
        chunking_strategy=chunking_strategy
    )
```

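The structured output lands on the crawl result as a JSON string, so consuming it is one `json.loads` away. A minimal sketch, assuming the extraction above succeeded and its data is exposed on `result.extracted_content`:

```python
import json

if result.extracted_content:
    articles = json.loads(result.extracted_content)
    for article in articles:
        print(article.get('title'), '-', article.get('author'))
```
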
### 10. Content Cleaning & Processing

The library provides extensive content cleaning and processing capabilities, ensuring high-quality output in various formats.

```python
# Basic Content Cleaning
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        remove_overlay_elements=True,  # Remove popups/modals
        process_iframes=True,          # Process iframe content
        word_count_threshold=10        # Minimum words per block
    )

    print(result.cleaned_html)   # Clean HTML
    print(result.fit_html)       # Most relevant HTML content
    print(result.fit_markdown)   # Most relevant markdown content

# Advanced Content Processing
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        excluded_tags=['form', 'header', 'footer', 'nav'],
        html2text={
            "escape_dot": False,
            "body_width": 0,
            "protect_links": True,
            "unicode_snob": True,
            "ignore_links": False,
            "ignore_images": False,
            "ignore_emphasis": False,
            "bypass_tables": False,
            "ignore_tables": False
        }
    )
```

### Advanced Usage Patterns

#### 1. Combining Multiple Features

```python
async with AsyncWebCrawler(
    browser_type="chromium",
    headless=False,
    verbose=True
) as crawler:
    result = await crawler.arun(
        url="https://example.com",

        # Anti-bot measures
        magic=True,
        simulate_user=True,

        # Content selection
        css_selector="article.main",
        word_count_threshold=10,

        # Dynamic content handling
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.dynamic-content",

        # Content filtering
        exclude_external_links=True,
        exclude_social_media_links=True,

        # Media handling
        screenshot=True,
        process_iframes=True,

        # Content cleaning
        remove_overlay_elements=True
    )
```

#### 2. Custom Extraction Pipeline

```python
from pydantic import BaseModel

from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

# Define custom schemas and strategies
class Article(BaseModel):
    title: str
    content: str
    date: str

# CSS extraction for initial content
css_schema = {
    "name": "Article Extraction",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h1", "type": "text"},
        {"name": "content", "selector": ".content", "type": "html"},
        {"name": "date", "selector": ".date", "type": "text"}
    ]
}

# LLM processing for semantic analysis
llm_strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",
    api_token="your-token",
    schema=Article.schema(),
    instruction="Extract and clean article content"
)

# Chunking strategy for large content
chunking = OverlappingWindowChunking(window_size=1000, overlap=100)

async with AsyncWebCrawler() as crawler:
    # First pass: Extract structure
    css_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=JsonCssExtractionStrategy(css_schema)
    )

    # Second pass: Semantic processing
    llm_result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=llm_strategy,
        chunking_strategy=chunking
    )
```

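To combine the two passes, parse each pass's output and merge per article. A sketch, assuming both results expose their data as JSON strings on `extracted_content` and return articles in the same order (index alignment is a simplification; match on a key like `title` in real use):

```python
import json

structured = json.loads(css_result.extracted_content or "[]")
semantic = json.loads(llm_result.extracted_content or "[]")

# Pair structural fields with the LLM's cleaned version, index-aligned.
merged = [
    {**css_item, **llm_item}
    for css_item, llm_item in zip(structured, semantic)
]
```
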
#### 3. Website Crawling with Custom Processing

```python
from typing import Dict
from urllib.parse import urlparse

class CustomWebsiteCrawler:
    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler
        self.results = {}

    def _is_valid_link(self, href: str) -> bool:
        # Minimal sanity check: keep only absolute http(s) URLs.
        return urlparse(href).scheme in ("http", "https")

    async def process_page(self, url: str) -> Dict:
        result = await self.crawler.arun(
            url=url,
            magic=True,
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True
        )

        # Process internal links
        internal_links = [
            link['href'] for link in result.links['internal']
            if self._is_valid_link(link['href'])
        ]

        # Extract media
        media_urls = [img['src'] for img in result.media['images']]

        return {
            'content': result.markdown,
            'links': internal_links,
            'media': media_urls,
            'metadata': result.metadata
        }

    async def crawl_website(self, start_url: str, max_depth: int = 2):
        visited = set()
        queue = [(start_url, 0)]

        while queue:
            url, depth = queue.pop(0)
            if depth > max_depth or url in visited:
                continue

            visited.add(url)
            self.results[url] = await self.process_page(url)

            # Follow the links discovered on this page.
            for link in self.results[url]['links']:
                queue.append((link, depth + 1))
```

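A usage sketch to drive the class above, mirroring the `asyncio.run` pattern from section 1:

```python
import asyncio

async def main():
    async with AsyncWebCrawler() as crawler:
        site_crawler = CustomWebsiteCrawler(crawler)
        await site_crawler.crawl_website("https://example.com", max_depth=2)
        for url, page in site_crawler.results.items():
            print(url, "-", len(page['links']), "links")

asyncio.run(main())
```
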