Update Documentation

2024-10-27 19:24:46 +08:00
parent 38474bd66a
commit 4239654722
111 changed files with 7680 additions and 53 deletions
--- a/docs/md_v2/api/arun.md
+++ b/docs/md_v2/api/arun.md
@@ -0,0 +1,226 @@
+# Complete Parameter Guide for arun()
+
+The following parameters can be passed to the `arun()` method. They are organized by their primary usage context and functionality.
+
+## Core Parameters
+
+```python
+await crawler.arun(
+    url="https://example.com",   # Required: URL to crawl
+    verbose=True,               # Enable detailed logging
+    bypass_cache=False,         # Skip cache for this request
+    warmup=True                # Whether to run warmup check
+)
+```
+
+## Content Processing Parameters
+
+### Text Processing
+```python
+await crawler.arun(
+    word_count_threshold=10,                # Minimum words per content block
+    image_description_min_word_threshold=5,  # Minimum words for image descriptions
+    only_text=False,                        # Extract only text content
+    excluded_tags=['form', 'nav'],          # HTML tags to exclude
+    keep_data_attributes=False,             # Preserve data-* attributes
+)
+```
+
+### Content Selection
+```python
+await crawler.arun(
+    css_selector=".main-content",  # CSS selector for content extraction
+    remove_forms=True,             # Remove all form elements
+    remove_overlay_elements=True,  # Remove popups/modals/overlays
+)
+```
+
+### Link Handling
+```python
+await crawler.arun(
+    exclude_external_links=True,          # Remove external links
+    exclude_social_media_links=True,      # Remove social media links
+    exclude_external_images=True,         # Remove external images
+    exclude_domains=["ads.example.com"],  # Specific domains to exclude
+    social_media_domains=[               # Additional social media domains
+        "facebook.com",
+        "twitter.com",
+        "instagram.com"
+    ]
+)
+```
+
+## Browser Control Parameters
+
+### Basic Browser Settings
+```python
+await crawler.arun(
+    headless=True,                # Run browser in headless mode
+    browser_type="chromium",      # Browser engine: "chromium", "firefox", "webkit"
+    page_timeout=60000,          # Page load timeout in milliseconds
+    user_agent="custom-agent",    # Custom user agent
+)
+```
+
+### Navigation and Waiting
+```python
+await crawler.arun(
+    wait_for="css:.dynamic-content",  # Wait for element/condition
+    delay_before_return_html=2.0,     # Wait before returning HTML (seconds)
+)
+```
+
+### JavaScript Execution
+```python
+await crawler.arun(
+    js_code=[                     # JavaScript to execute (string or list)
+        "window.scrollTo(0, document.body.scrollHeight);",
+        "document.querySelector('.load-more').click();"
+    ],
+    js_only=False,               # Only execute JavaScript without reloading page
+)
+```
+
+### Anti-Bot Features
+```python
+await crawler.arun(
+    magic=True,              # Enable all anti-detection features
+    simulate_user=True,      # Simulate human behavior
+    override_navigator=True  # Override navigator properties
+)
+```
+
+### Session Management
+```python
+await crawler.arun(
+    session_id="my_session",  # Session identifier for persistent browsing
+)
+```
+
+### Screenshot Options
+```python
+await crawler.arun(
+    screenshot=True,              # Take page screenshot
+    screenshot_wait_for=2.0,      # Wait before screenshot (seconds)
+)
+```
+
+### Proxy Configuration
+```python
+await crawler.arun(
+    proxy="http://proxy.example.com:8080",     # Simple proxy URL
+    proxy_config={                             # Advanced proxy settings
+        "server": "http://proxy.example.com:8080",
+        "username": "user",
+        "password": "pass"
+    }
+)
+```
+
+## Content Extraction Parameters
+
+### Extraction Strategy
+```python
+await crawler.arun(
+    extraction_strategy=LLMExtractionStrategy(
+        provider="ollama/llama2",
+        schema=MySchema.schema(),
+        instruction="Extract specific data"
+    )
+)
+```
+
+### Chunking Strategy
+```python
+await crawler.arun(
+    chunking_strategy=RegexChunking(
+        patterns=[r'\n\n', r'\.\s+']
+    )
+)
+```
+
+### HTML to Text Options
+```python
+await crawler.arun(
+    html2text={
+        "ignore_links": False,
+        "ignore_images": False,
+        "escape_dot": False,
+        "body_width": 0,
+        "protect_links": True,
+        "unicode_snob": True
+    }
+)
+```
+
+## Debug Options
+```python
+await crawler.arun(
+    log_console=True,   # Log browser console messages
+)
+```
+
+## Parameter Interactions and Notes
+
+1. **Magic Mode Combinations**
+   ```python
+   # Full anti-detection setup
+   await crawler.arun(
+       magic=True,
+       headless=False,
+       simulate_user=True,
+       override_navigator=True
+   )
+   ```
+
+2. **Dynamic Content Handling**
+   ```python
+   # Handle lazy-loaded content
+   await crawler.arun(
+       js_code="window.scrollTo(0, document.body.scrollHeight);",
+       wait_for="css:.lazy-content",
+       delay_before_return_html=2.0
+   )
+   ```
+
+3. **Content Extraction Pipeline**
+   ```python
+   # Complete extraction setup
+   await crawler.arun(
+       css_selector=".main-content",
+       word_count_threshold=20,
+       extraction_strategy=my_strategy,
+       chunking_strategy=my_chunking,
+       process_iframes=True,
+       remove_overlay_elements=True
+   )
+   ```
+
+## Best Practices
+
+1. **Performance Optimization**
+   ```python
+   await crawler.arun(
+       bypass_cache=False,           # Use cache when possible
+       word_count_threshold=10,      # Filter out noise
+       process_iframes=False         # Skip iframes if not needed
+   )
+   ```
+
+2. **Reliable Scraping**
+   ```python
+   await crawler.arun(
+       magic=True,                   # Enable anti-detection
+       delay_before_return_html=1.0, # Wait for dynamic content
+       page_timeout=60000           # Longer timeout for slow pages
+   )
+   ```
+
+3. **Clean Content**
+   ```python
+   await crawler.arun(
+       remove_overlay_elements=True,  # Remove popups
+       excluded_tags=['nav', 'aside'],# Remove unnecessary elements
+       keep_data_attributes=False     # Remove data attributes
+   )
+   ```
--- a/docs/md_v2/api/async-webcrawler.md
+++ b/docs/md_v2/api/async-webcrawler.md
@@ -0,0 +1,320 @@
+# AsyncWebCrawler
+
+The `AsyncWebCrawler` class is the main interface for web crawling operations. It provides asynchronous web crawling capabilities with extensive configuration options.
+
+## Constructor
+
+```python
+AsyncWebCrawler(
+    # Browser Settings
+    browser_type: str = "chromium",         # Options: "chromium", "firefox", "webkit"
+    headless: bool = True,                  # Run browser in headless mode
+    verbose: bool = False,                  # Enable verbose logging
+    
+    # Cache Settings
+    always_by_pass_cache: bool = False,     # Always bypass cache
+    base_directory: str = str(Path.home()), # Base directory for cache
+    
+    # Network Settings
+    proxy: str = None,                      # Simple proxy URL
+    proxy_config: Dict = None,              # Advanced proxy configuration
+    
+    # Browser Behavior
+    sleep_on_close: bool = False,           # Wait before closing browser
+    
+    # Custom Settings
+    user_agent: str = None,                 # Custom user agent
+    headers: Dict[str, str] = {},           # Custom HTTP headers
+    js_code: Union[str, List[str]] = None,  # Default JavaScript to execute
+)
+```
+
+### Parameters in Detail
+
+#### Browser Settings
+
+- **browser_type** (str, optional)
+  - Default: `"chromium"`
+  - Options: `"chromium"`, `"firefox"`, `"webkit"`
+  - Controls which browser engine to use
+  ```python
+  # Example: Using Firefox
+  crawler = AsyncWebCrawler(browser_type="firefox")
+  ```
+
+- **headless** (bool, optional)
+  - Default: `True`
+  - When `True`, browser runs without GUI
+  - Set to `False` for debugging
+  ```python
+  # Visible browser for debugging
+  crawler = AsyncWebCrawler(headless=False)
+  ```
+
+- **verbose** (bool, optional)
+  - Default: `False`
+  - Enables detailed logging
+  ```python
+  # Enable detailed logging
+  crawler = AsyncWebCrawler(verbose=True)
+  ```
+
+#### Cache Settings
+
+- **always_by_pass_cache** (bool, optional)
+  - Default: `False`
+  - When `True`, always fetches fresh content
+  ```python
+  # Always fetch fresh content
+  crawler = AsyncWebCrawler(always_by_pass_cache=True)
+  ```
+
+- **base_directory** (str, optional)
+  - Default: User's home directory
+  - Base path for cache storage
+  ```python
+  # Custom cache directory
+  crawler = AsyncWebCrawler(base_directory="/path/to/cache")
+  ```
+
+#### Network Settings
+
+- **proxy** (str, optional)
+  - Simple proxy URL
+  ```python
+  # Using simple proxy
+  crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080")
+  ```
+
+- **proxy_config** (Dict, optional)
+  - Advanced proxy configuration with authentication
+  ```python
+  # Advanced proxy with auth
+  crawler = AsyncWebCrawler(proxy_config={
+      "server": "http://proxy.example.com:8080",
+      "username": "user",
+      "password": "pass"
+  })
+  ```
+
+#### Browser Behavior
+
+- **sleep_on_close** (bool, optional)
+  - Default: `False`
+  - Adds delay before closing browser
+  ```python
+  # Wait before closing
+  crawler = AsyncWebCrawler(sleep_on_close=True)
+  ```
+
+#### Custom Settings
+
+- **user_agent** (str, optional)
+  - Custom user agent string
+  ```python
+  # Custom user agent
+  crawler = AsyncWebCrawler(
+      user_agent="Mozilla/5.0 (Custom Agent) Chrome/90.0"
+  )
+  ```
+
+- **headers** (Dict[str, str], optional)
+  - Custom HTTP headers
+  ```python
+  # Custom headers
+  crawler = AsyncWebCrawler(
+      headers={
+          "Accept-Language": "en-US",
+          "Custom-Header": "Value"
+      }
+  )
+  ```
+
+- **js_code** (Union[str, List[str]], optional)
+  - Default JavaScript to execute on each page
+  ```python
+  # Default JavaScript
+  crawler = AsyncWebCrawler(
+      js_code=[
+          "window.scrollTo(0, document.body.scrollHeight);",
+          "document.querySelector('.load-more').click();"
+      ]
+  )
+  ```
+
+## Methods
+
+### arun()
+
+The primary method for crawling web pages.
+
+```python
+async def arun(
+    # Required
+    url: str,                              # URL to crawl
+    
+    # Content Selection
+    css_selector: str = None,              # CSS selector for content
+    word_count_threshold: int = 10,        # Minimum words per block
+    
+    # Cache Control
+    bypass_cache: bool = False,            # Bypass cache for this request
+    
+    # Session Management
+    session_id: str = None,                # Session identifier
+    
+    # Screenshot Options
+    screenshot: bool = False,              # Take screenshot
+    screenshot_wait_for: float = None,     # Wait before screenshot (seconds)
+    
+    # Content Processing
+    process_iframes: bool = False,         # Process iframe content
+    remove_overlay_elements: bool = False, # Remove popups/modals
+    
+    # Anti-Bot Settings
+    simulate_user: bool = False,           # Simulate human behavior
+    override_navigator: bool = False,      # Override navigator properties
+    magic: bool = False,                   # Enable all anti-detection
+    
+    # Content Filtering
+    excluded_tags: List[str] = None,       # HTML tags to exclude
+    exclude_external_links: bool = False,  # Remove external links
+    exclude_social_media_links: bool = False, # Remove social media links
+    
+    # JavaScript Handling
+    js_code: Union[str, List[str]] = None, # JavaScript to execute
+    wait_for: str = None,                  # Wait condition
+    
+    # Page Loading
+    page_timeout: int = 60000,            # Page load timeout (ms)
+    delay_before_return_html: float = None, # Wait before returning HTML (seconds)
+    
+    # Extraction
+    extraction_strategy: ExtractionStrategy = None  # Extraction strategy
+) -> CrawlResult:
+```
+
+### Usage Examples
+
+#### Basic Crawling
+```python
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+#### Advanced Crawling
+```python
+async with AsyncWebCrawler(
+    browser_type="firefox",
+    verbose=True,
+    headers={"Custom-Header": "Value"}
+) as crawler:
+    result = await crawler.arun(
+        url="https://example.com",
+        css_selector=".main-content",
+        word_count_threshold=20,
+        process_iframes=True,
+        magic=True,
+        wait_for="css:.dynamic-content",
+        screenshot=True
+    )
+```
+
+#### Session Management
+```python
+async with AsyncWebCrawler() as crawler:
+    # First request
+    result1 = await crawler.arun(
+        url="https://example.com/login",
+        session_id="my_session"
+    )
+    
+    # Subsequent request using same session
+    result2 = await crawler.arun(
+        url="https://example.com/protected",
+        session_id="my_session"
+    )
+```
+
+## Context Manager
+
+AsyncWebCrawler implements the async context manager protocol:
+
+```python
+async def __aenter__(self) -> 'AsyncWebCrawler':
+    # Initialize browser and resources
+    return self
+
+async def __aexit__(self, *args):
+    # Cleanup resources
+    pass
+```
+
+Always use `AsyncWebCrawler` as an async context manager:
+```python
+async with AsyncWebCrawler() as crawler:
+    # Your crawling code here
+    pass
+```
+
+## Best Practices
+
+1. **Resource Management**
+```python
+# Always use context manager
+async with AsyncWebCrawler() as crawler:
+    # Crawler will be properly cleaned up
+    pass
+```
+
+2. **Error Handling**
+```python
+try:
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com")
+        if not result.success:
+            print(f"Crawl failed: {result.error_message}")
+except Exception as e:
+    print(f"Error: {str(e)}")
+```
+
+3. **Performance Optimization**
+```python
+# Enable caching for better performance
+crawler = AsyncWebCrawler(
+    always_by_pass_cache=False,
+    verbose=True
+)
+```
+
+4. **Anti-Detection**
+```python
+# Maximum stealth
+crawler = AsyncWebCrawler(
+    headless=True,
+    user_agent="Mozilla/5.0...",
+    headers={"Accept-Language": "en-US"}
+)
+result = await crawler.arun(
+    url="https://example.com",
+    magic=True,
+    simulate_user=True
+)
+```
+
+## Note on Browser Types
+
+Each browser type has its own characteristics:
+
+- **chromium**: Best overall compatibility
+- **firefox**: Good for specific use cases
+- **webkit**: Lighter weight, good for basic crawling
+
+Choose based on your specific needs:
+```python
+# High compatibility
+crawler = AsyncWebCrawler(browser_type="chromium")
+
+# Memory efficient
+crawler = AsyncWebCrawler(browser_type="webkit")
+```
--- a/docs/md_v2/api/crawl-result.md
+++ b/docs/md_v2/api/crawl-result.md
@@ -0,0 +1,301 @@
+# CrawlResult
+
+The `CrawlResult` class represents the result of a web crawling operation. It provides access to various forms of extracted content and metadata from the crawled webpage.
+
+## Class Definition
+
+```python
+class CrawlResult(BaseModel):
+    """Result of a web crawling operation."""
+    
+    # Basic Information
+    url: str                                # Crawled URL
+    success: bool                           # Whether crawl succeeded
+    status_code: Optional[int] = None       # HTTP status code
+    error_message: Optional[str] = None     # Error message if failed
+    
+    # Content
+    html: str                              # Raw HTML content
+    cleaned_html: Optional[str] = None      # Cleaned HTML
+    fit_html: Optional[str] = None          # Most relevant HTML content
+    markdown: Optional[str] = None          # HTML converted to markdown
+    fit_markdown: Optional[str] = None      # Most relevant markdown content
+    
+    # Extracted Data
+    extracted_content: Optional[str] = None  # Content from extraction strategy
+    media: Dict[str, List[Dict]] = {}       # Extracted media information
+    links: Dict[str, List[Dict]] = {}       # Extracted links
+    metadata: Optional[dict] = None         # Page metadata
+    
+    # Additional Data
+    screenshot: Optional[str] = None         # Base64 encoded screenshot
+    session_id: Optional[str] = None         # Session identifier
+    response_headers: Optional[dict] = None  # HTTP response headers
+```
+
+## Properties and Their Data Structures
+
+### Basic Information
+
+```python
+# Access basic information
+result = await crawler.arun(url="https://example.com")
+
+print(result.url)          # "https://example.com"
+print(result.success)      # True/False
+print(result.status_code)  # 200, 404, etc.
+print(result.error_message)  # Error details if failed
+```
+
+### Content Properties
+
+#### HTML Content
+```python
+# Raw HTML
+html_content = result.html
+
+# Cleaned HTML (removed ads, popups, etc.)
+clean_content = result.cleaned_html
+
+# Most relevant HTML content
+main_content = result.fit_html
+```
+
+#### Markdown Content
+```python
+# Full markdown version
+markdown_content = result.markdown
+
+# Most relevant markdown content
+main_content = result.fit_markdown
+```
+
+### Media Content
+
+The `media` dictionary groups extracted media elements by type:
+
+```python
+# Structure
+media = {
+    "images": [
+        {
+            "src": str,           # Image URL
+            "alt": str,           # Alt text
+            "desc": str,          # Contextual description
+            "score": float,       # Relevance score (0-10)
+            "type": str,          # "image"
+            "width": int,         # Image width (if available)
+            "height": int,        # Image height (if available)
+            "context": str,       # Surrounding text
+            "lazy": bool          # Whether image was lazy-loaded
+        }
+    ],
+    "videos": [
+        {
+            "src": str,           # Video URL
+            "type": str,          # "video"
+            "title": str,         # Video title
+            "poster": str,        # Thumbnail URL
+            "duration": str,      # Video duration
+            "description": str    # Video description
+        }
+    ],
+    "audios": [
+        {
+            "src": str,           # Audio URL
+            "type": str,          # "audio"
+            "title": str,         # Audio title
+            "duration": str,      # Audio duration
+            "description": str    # Audio description
+        }
+    ]
+}
+
+# Example usage
+for image in result.media["images"]:
+    if image["score"] > 5:  # High-relevance images
+        print(f"High-quality image: {image['src']}")
+        print(f"Context: {image['context']}")
+```
+
+### Link Analysis
+
+The `links` dictionary groups discovered links into internal and external lists:
+
+```python
+# Structure
+links = {
+    "internal": [
+        {
+            "href": str,          # URL
+            "text": str,          # Link text
+            "title": str,         # Title attribute
+            "type": str,          # Link type (nav, content, etc.)
+            "context": str,       # Surrounding text
+            "score": float        # Relevance score
+        }
+    ],
+    "external": [
+        {
+            "href": str,          # External URL
+            "text": str,          # Link text
+            "title": str,         # Title attribute
+            "domain": str,        # Domain name
+            "type": str,          # Link type
+            "context": str        # Surrounding text
+        }
+    ]
+}
+
+# Example usage
+for link in result.links["internal"]:
+    print(f"Internal link: {link['href']}")
+    print(f"Context: {link['context']}")
+```
+
+### Metadata
+
+The metadata dictionary contains page information:
+
+```python
+# Structure
+metadata = {
+    "title": str,                # Page title
+    "description": str,          # Meta description
+    "keywords": List[str],       # Meta keywords
+    "author": str,              # Author information
+    "published_date": str,      # Publication date
+    "modified_date": str,       # Last modified date
+    "language": str,            # Page language
+    "canonical_url": str,       # Canonical URL
+    "og_data": Dict,           # Open Graph data
+    "twitter_data": Dict       # Twitter card data
+}
+
+# Example usage
+if result.metadata:
+    print(f"Title: {result.metadata['title']}")
+    print(f"Author: {result.metadata.get('author', 'Unknown')}")
+```
+
+### Extracted Content
+
+Content from extraction strategies:
+
+```python
+# For LLM or CSS extraction strategies
+if result.extracted_content:
+    structured_data = json.loads(result.extracted_content)
+    print(structured_data)
+```
+
+### Screenshot
+
+Base64-encoded screenshot:
+
+```python
+# Save screenshot if available
+if result.screenshot:
+    import base64
+    
+    # Decode and save
+    with open("screenshot.png", "wb") as f:
+        f.write(base64.b64decode(result.screenshot))
+```
+
+## Usage Examples
+
+### Basic Content Access
+```python
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun(url="https://example.com")
+    
+    if result.success:
+        # Get clean content
+        print(result.fit_markdown)
+        
+        # Process images
+        for image in result.media["images"]:
+            if image["score"] > 7:
+                print(f"High-quality image: {image['src']}")
+```
+
+### Complete Data Processing
+```python
+async def process_webpage(url: str) -> Dict:
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=url)
+        
+        if not result.success:
+            raise Exception(f"Crawl failed: {result.error_message}")
+        
+        return {
+            "content": result.fit_markdown,
+            "images": [
+                img for img in result.media["images"]
+                if img["score"] > 5
+            ],
+            "internal_links": [
+                link["href"] for link in result.links["internal"]
+            ],
+            "metadata": result.metadata,
+            "status": result.status_code
+        }
+```
+
+### Error Handling
+```python
+async def safe_crawl(url: str) -> Dict:
+    async with AsyncWebCrawler() as crawler:
+        try:
+            result = await crawler.arun(url=url)
+            
+            if not result.success:
+                return {
+                    "success": False,
+                    "error": result.error_message,
+                    "status": result.status_code
+                }
+            
+            return {
+                "success": True,
+                "content": result.fit_markdown,
+                "status": result.status_code
+            }
+            
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e),
+                "status": None
+            }
+```
+
+## Best Practices
+
+1. **Always Check Success**
+```python
+if not result.success:
+    print(f"Error: {result.error_message}")
+    return
+```
+
+2. **Use fit_markdown for Articles**
+```python
+# Better for article content
+content = result.fit_markdown if result.fit_markdown else result.markdown
+```
+
+3. **Filter Media by Score**
+```python
+relevant_images = [
+    img for img in result.media["images"]
+    if img["score"] > 5
+]
+```
+
+4. **Handle Missing Data**
+```python
+metadata = result.metadata or {}
+title = metadata.get('title', 'Unknown Title')
+```
--- a/docs/md_v2/api/strategies.md
+++ b/docs/md_v2/api/strategies.md
@@ -0,0 +1,255 @@
+# Extraction & Chunking Strategies API
+
+This page is the API reference for the extraction and chunking strategies in Crawl4AI.
+
+## Extraction Strategies
+
+All extraction strategies inherit from the base `ExtractionStrategy` class and implement two key methods:
+- `extract(url: str, html: str) -> List[Dict[str, Any]]`
+- `run(url: str, sections: List[str]) -> List[Dict[str, Any]]`
+
+### LLMExtractionStrategy
+
+Used for extracting structured data using Language Models.
+
+```python
+LLMExtractionStrategy(
+    # Provider Settings
+    provider: str = DEFAULT_PROVIDER,     # LLM provider (e.g., "ollama/llama2")
+    api_token: Optional[str] = None,      # API token
+    
+    # Extraction Configuration
+    instruction: str = None,              # Custom extraction instruction
+    schema: Dict = None,                  # Pydantic model schema for structured data
+    extraction_type: str = "block",       # "block" or "schema"
+    
+    # Chunking Parameters
+    chunk_token_threshold: int = 4000,    # Maximum tokens per chunk
+    overlap_rate: float = 0.1,           # Overlap between chunks
+    word_token_rate: float = 0.75,       # Word to token conversion rate
+    apply_chunking: bool = True,         # Enable/disable chunking
+    
+    # API Configuration
+    base_url: str = None,                # Base URL for API
+    extra_args: Dict = {},               # Additional provider arguments
+    verbose: bool = False                # Enable verbose logging
+)
+```
+
+### CosineStrategy
+
+Used for content similarity-based extraction and clustering.
+
+```python
+CosineStrategy(
+    # Content Filtering
+    semantic_filter: str = None,        # Topic/keyword filter
+    word_count_threshold: int = 10,     # Minimum words per cluster
+    sim_threshold: float = 0.3,         # Similarity threshold
+    
+    # Clustering Parameters
+    max_dist: float = 0.2,             # Maximum cluster distance
+    linkage_method: str = 'ward',       # Clustering method
+    top_k: int = 3,                    # Top clusters to return
+    
+    # Model Configuration
+    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',  # Embedding model
+    
+    verbose: bool = False              # Enable verbose logging
+)
+```
+
+### JsonCssExtractionStrategy
+
+Used for CSS selector-based structured data extraction.
+
+```python
+JsonCssExtractionStrategy(
+    schema: Dict[str, Any],    # Extraction schema
+    verbose: bool = False      # Enable verbose logging
+)
+
+# Schema Structure
+schema = {
+    "name": str,              # Schema name
+    "baseSelector": str,      # Base CSS selector
+    "fields": [               # List of fields to extract
+        {
+            "name": str,      # Field name
+            "selector": str,  # CSS selector
+            "type": str,     # Field type: "text", "attribute", "html", "regex"
+            "attribute": str, # For type="attribute"
+            "pattern": str,  # For type="regex"
+            "transform": str, # Optional: "lowercase", "uppercase", "strip"
+            "default": Any    # Default value if extraction fails
+        }
+    ]
+}
+```
+
+## Chunking Strategies
+
+All chunking strategies inherit from `ChunkingStrategy` and implement the `chunk(text: str) -> list` method.
+
+### RegexChunking
+
+Splits text based on regex patterns.
+
+```python
+RegexChunking(
+    patterns: List[str] = None  # Regex patterns for splitting
+                               # Default: [r'\n\n']
+)
+```
+
+### SlidingWindowChunking
+
+Creates overlapping chunks with a sliding window approach.
+
+```python
+SlidingWindowChunking(
+    window_size: int = 100,    # Window size in words
+    step: int = 50             # Step size between windows
+)
+```
+
+### OverlappingWindowChunking
+
+Creates chunks with specified overlap.
+
+```python
+OverlappingWindowChunking(
+    window_size: int = 1000,   # Chunk size in words
+    overlap: int = 100         # Overlap size in words
+)
+```
+
+## Usage Examples
+
+### LLM Extraction
+
+```python
+from pydantic import BaseModel
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+# Define schema
+class Article(BaseModel):
+    title: str
+    content: str
+    author: str
+
+# Create strategy
+strategy = LLMExtractionStrategy(
+    provider="ollama/llama2",
+    schema=Article.schema(),
+    instruction="Extract article details"
+)
+
+# Use with crawler
+result = await crawler.arun(
+    url="https://example.com/article",
+    extraction_strategy=strategy
+)
+
+# Access extracted data
+data = json.loads(result.extracted_content)
+```
+
+### CSS Extraction
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+# Define schema
+schema = {
+    "name": "Product List",
+    "baseSelector": ".product-card",
+    "fields": [
+        {
+            "name": "title",
+            "selector": "h2.title",
+            "type": "text"
+        },
+        {
+            "name": "price",
+            "selector": ".price",
+            "type": "text",
+            "transform": "strip"
+        },
+        {
+            "name": "image",
+            "selector": "img",
+            "type": "attribute",
+            "attribute": "src"
+        }
+    ]
+}
+
+# Create and use strategy
+strategy = JsonCssExtractionStrategy(schema)
+result = await crawler.arun(
+    url="https://example.com/products",
+    extraction_strategy=strategy
+)
+```
+
+### Content Chunking
+
+```python
+from crawl4ai.chunking_strategy import OverlappingWindowChunking
+
+# Create chunking strategy
+chunker = OverlappingWindowChunking(
+    window_size=500,  # 500 words per chunk
+    overlap=50        # 50 words overlap
+)
+
+# Pass the chunker to arun() alongside the extraction strategy
+strategy = LLMExtractionStrategy(
+    provider="ollama/llama2"
+)
+
+result = await crawler.arun(
+    url="https://example.com/long-article",
+    extraction_strategy=strategy,
+    chunking_strategy=chunker
+)
+```
+
+## Best Practices
+
+1. **Choose the Right Strategy**
+   - Use `LLMExtractionStrategy` for complex, unstructured content
+   - Use `JsonCssExtractionStrategy` for well-structured HTML
+   - Use `CosineStrategy` for content similarity and clustering
+
+2. **Optimize Chunking**
+   ```python
+   # For long documents
+   strategy = LLMExtractionStrategy(
+       chunk_token_threshold=2000,  # Smaller chunks
+       overlap_rate=0.1           # 10% overlap
+   )
+   ```
+
+3. **Handle Errors**
+   ```python
+   try:
+       result = await crawler.arun(
+           url="https://example.com",
+           extraction_strategy=strategy
+       )
+       if result.success:
+           content = json.loads(result.extracted_content)
+   except Exception as e:
+       print(f"Extraction failed: {e}")
+   ```
+
+4. **Monitor Performance**
+   ```python
+   strategy = CosineStrategy(
+       verbose=True,  # Enable logging
+       word_count_threshold=20,  # Filter short content
+       top_k=5  # Limit results
+   )
+   ```