Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-11-05 12:58:30 +00:00
parent e8aaa57cb2 1e7db0d293
commit 1222e456fb
32 changed files with 2280 additions and 62170 deletions
--- a/docs/assets/pitch-dark.png
+++ b/docs/assets/pitch-dark.png
--- a/docs/assets/pitch-dark.svg
+++ b/docs/assets/pitch-dark.svg
@@ -0,0 +1,64 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 800 500">
+    <!-- Background -->
+    <rect width="800" height="500" fill="#1a1a1a"/>
+    
+    <!-- Opportunities Section -->
+    <g transform="translate(50,50)">
+        <!-- Opportunity 1 Box -->
+        <rect x="0" y="0" width="300" height="150" rx="10" fill="#1a2d3d" stroke="#64b5f6" stroke-width="2"/>
+        <text x="150" y="30" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#64b5f6">Data Capitalization Opportunity</text>
+        <text x="150" y="60" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">
+            <tspan x="150" dy="0">Transform digital footprints into assets</tspan>
+            <tspan x="150" dy="20">Personal data as capital</tspan>
+            <tspan x="150" dy="20">Enterprise knowledge valuation</tspan>
+            <tspan x="150" dy="20">New form of wealth creation</tspan>
+        </text>
+
+        <!-- Opportunity 2 Box -->
+        <rect x="0" y="200" width="300" height="150" rx="10" fill="#1a2d1a" stroke="#81c784" stroke-width="2"/>
+        <text x="150" y="230" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#81c784">Authentic Data Potential</text>
+        <text x="150" y="260" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">
+            <tspan x="150" dy="0">Vast reservoir of real insights</tspan>
+            <tspan x="150" dy="20">Enhanced AI development</tspan>
+            <tspan x="150" dy="20">Diverse human knowledge</tspan>
+            <tspan x="150" dy="20">Willing participation model</tspan>
+        </text>
+    </g>
+
+    <!-- Development Pathway -->
+    <g transform="translate(450,50)">
+        <!-- Step 1 Box -->
+        <rect x="0" y="0" width="300" height="100" rx="10" fill="#2d1a2d" stroke="#ce93d8" stroke-width="2"/>
+        <text x="150" y="35" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ce93d8">1. Open-Source Foundation</text>
+        <text x="150" y="65" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">Data extraction engine &amp; community development</text>
+
+        <!-- Step 2 Box -->
+        <rect x="0" y="125" width="300" height="100" rx="10" fill="#2d1a2d" stroke="#ce93d8" stroke-width="2"/>
+        <text x="150" y="160" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ce93d8">2. Data Capitalization Platform</text>
+        <text x="150" y="190" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">Tools to structure &amp; value digital assets</text>
+
+        <!-- Step 3 Box -->
+        <rect x="0" y="250" width="300" height="100" rx="10" fill="#2d1a2d" stroke="#ce93d8" stroke-width="2"/>
+        <text x="150" y="285" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ce93d8">3. Shared Data Marketplace</text>
+        <text x="150" y="315" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">Economic platform for data exchange</text>
+    </g>
+
+    <!-- Connecting Arrows -->
+    <g transform="translate(400,125)">
+        <path d="M-20,0 L40,0" stroke="#666" stroke-width="2" marker-end="url(#arrowhead)"/>
+        <path d="M-20,200 L40,200" stroke="#666" stroke-width="2" marker-end="url(#arrowhead)"/>
+    </g>
+
+    <!-- Arrow Marker -->
+    <defs>
+        <marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
+            <polygon points="0 0, 10 3.5, 0 7" fill="#666"/>
+        </marker>
+    </defs>
+
+    <!-- Vision Box at Bottom -->
+    <g transform="translate(200,420)">
+        <rect x="0" y="0" width="400" height="60" rx="10" fill="#2d2613" stroke="#ffd54f" stroke-width="2"/>
+        <text x="200" y="35" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ffd54f">Economic Vision: Shared Data Economy</text>
+    </g>
+</svg>
--- a/docs/md_v2/advanced/hooks.md
+++ b/docs/md_v2/advanced/hooks.md
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -0,0 +1,35 @@
+# Parameter Reference Table
+
+| File Name | Parameter Name | Code Usage | Strategy/Class | Description |
+|-----------|---------------|------------|----------------|-------------|
+| async_crawler_strategy.py | user_agent | `kwargs.get("user_agent")` | AsyncPlaywrightCrawlerStrategy | User agent string for browser identification |
+| async_crawler_strategy.py | proxy | `kwargs.get("proxy")` | AsyncPlaywrightCrawlerStrategy | Proxy server configuration for network requests |
+| async_crawler_strategy.py | proxy_config | `kwargs.get("proxy_config")` | AsyncPlaywrightCrawlerStrategy | Detailed proxy configuration including auth |
+| async_crawler_strategy.py | headless | `kwargs.get("headless", True)` | AsyncPlaywrightCrawlerStrategy | Whether to run browser in headless mode |
+| async_crawler_strategy.py | browser_type | `kwargs.get("browser_type", "chromium")` | AsyncPlaywrightCrawlerStrategy | Type of browser to use (chromium/firefox/webkit) |
+| async_crawler_strategy.py | headers | `kwargs.get("headers", {})` | AsyncPlaywrightCrawlerStrategy | Custom HTTP headers for requests |
+| async_crawler_strategy.py | verbose | `kwargs.get("verbose", False)` | AsyncPlaywrightCrawlerStrategy | Enable detailed logging output |
+| async_crawler_strategy.py | sleep_on_close | `kwargs.get("sleep_on_close", False)` | AsyncPlaywrightCrawlerStrategy | Add delay before closing browser |
+| async_crawler_strategy.py | use_managed_browser | `kwargs.get("use_managed_browser", False)` | AsyncPlaywrightCrawlerStrategy | Use managed browser instance |
+| async_crawler_strategy.py | user_data_dir | `kwargs.get("user_data_dir", None)` | AsyncPlaywrightCrawlerStrategy | Custom directory for browser profile data |
+| async_crawler_strategy.py | session_id | `kwargs.get("session_id")` | AsyncPlaywrightCrawlerStrategy | Unique identifier for browser session |
+| async_crawler_strategy.py | override_navigator | `kwargs.get("override_navigator", False)` | AsyncPlaywrightCrawlerStrategy | Override browser navigator properties |
+| async_crawler_strategy.py | simulate_user | `kwargs.get("simulate_user", False)` | AsyncPlaywrightCrawlerStrategy | Simulate human-like behavior |
+| async_crawler_strategy.py | magic | `kwargs.get("magic", False)` | AsyncPlaywrightCrawlerStrategy | Enable advanced anti-detection features |
+| async_crawler_strategy.py | log_console | `kwargs.get("log_console", False)` | AsyncPlaywrightCrawlerStrategy | Log browser console messages |
+| async_crawler_strategy.py | js_only | `kwargs.get("js_only", False)` | AsyncPlaywrightCrawlerStrategy | Only execute JavaScript without page load |
+| async_crawler_strategy.py | page_timeout | `kwargs.get("page_timeout", 60000)` | AsyncPlaywrightCrawlerStrategy | Timeout for page load in milliseconds |
+| async_crawler_strategy.py | ignore_body_visibility | `kwargs.get("ignore_body_visibility", True)` | AsyncPlaywrightCrawlerStrategy | Process page even if body is hidden |
+| async_crawler_strategy.py | js_code | `kwargs.get("js_code", kwargs.get("js", self.js_code))` | AsyncPlaywrightCrawlerStrategy | Custom JavaScript code to execute |
+| async_crawler_strategy.py | wait_for | `kwargs.get("wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait for specific element/condition |
+| async_crawler_strategy.py | process_iframes | `kwargs.get("process_iframes", False)` | AsyncPlaywrightCrawlerStrategy | Extract content from iframes |
+| async_crawler_strategy.py | delay_before_return_html | `kwargs.get("delay_before_return_html")` | AsyncPlaywrightCrawlerStrategy | Additional delay before returning HTML |
+| async_crawler_strategy.py | remove_overlay_elements | `kwargs.get("remove_overlay_elements", False)` | AsyncPlaywrightCrawlerStrategy | Remove pop-ups and overlay elements |
+| async_crawler_strategy.py | screenshot | `kwargs.get("screenshot")` | AsyncPlaywrightCrawlerStrategy | Take page screenshot |
+| async_crawler_strategy.py | screenshot_wait_for | `kwargs.get("screenshot_wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait before taking screenshot |
+| async_crawler_strategy.py | semaphore_count | `kwargs.get("semaphore_count", 5)` | AsyncPlaywrightCrawlerStrategy | Concurrent request limit |
+| async_webcrawler.py | verbose | `kwargs.get("verbose", False)` | AsyncWebCrawler | Enable detailed logging |
+| async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request |
+| async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse |
+| async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content |
+| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl |
--- a/docs/md_v2/basic/docker-deploymeny.md
+++ b/docs/md_v2/basic/docker-deploymeny.md
@@ -0,0 +1,459 @@
+# Docker Deployment
+
+Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments.
+
+## Quick Start 🚀
+
+Pull and run the basic version:
+
+```bash
+docker pull unclecode/crawl4ai:basic
+docker run -p 11235:11235 unclecode/crawl4ai:basic
+```
+
+Test the deployment:
+```python
+import requests
+
+# Test health endpoint
+health = requests.get("http://localhost:11235/health")
+print("Health check:", health.json())
+
+# Test basic crawl
+response = requests.post(
+    "http://localhost:11235/crawl",
+    json={
+        "urls": "https://www.nbcnews.com/business",
+        "priority": 10
+    }
+)
+task_id = response.json()["task_id"]
+print("Task ID:", task_id)
+```
+
+## Available Images 🏷️
+
+- `unclecode/crawl4ai:basic` - Basic web crawling capabilities
+- `unclecode/crawl4ai:all` - Full installation with all features
+- `unclecode/crawl4ai:gpu` - GPU-enabled version for ML features
+
+## Configuration Options 🔧
+
+### Environment Variables
+
+```bash
+docker run -p 11235:11235 \
+    -e MAX_CONCURRENT_TASKS=5 \
+    -e OPENAI_API_KEY=your_key \
+    unclecode/crawl4ai:all
+```
+
+### Volume Mounting
+
+Mount a directory for persistent data:
+```bash
+docker run -p 11235:11235 \
+    -v $(pwd)/data:/app/data \
+    unclecode/crawl4ai:all
+```
+
+### Resource Limits
+
+Control container resources:
+```bash
+docker run -p 11235:11235 \
+    --memory=4g \
+    --cpus=2 \
+    unclecode/crawl4ai:all
+```
+
+## Usage Examples 📝
+
+### Basic Crawling
+
+```python
+request = {
+    "urls": "https://www.nbcnews.com/business",
+    "priority": 10
+}
+
+response = requests.post("http://localhost:11235/crawl", json=request)
+task_id = response.json()["task_id"]
+
+# Get results
+result = requests.get(f"http://localhost:11235/task/{task_id}")
+```
+
+### Structured Data Extraction
+
+```python
+schema = {
+    "name": "Crypto Prices",
+    "baseSelector": ".cds-tableRow-t45thuk",
+    "fields": [
+        {
+            "name": "crypto",
+            "selector": "td:nth-child(1) h2",
+            "type": "text",
+        },
+        {
+            "name": "price",
+            "selector": "td:nth-child(2)",
+            "type": "text",
+        }
+    ],
+}
+
+request = {
+    "urls": "https://www.coinbase.com/explore",
+    "extraction_config": {
+        "type": "json_css",
+        "params": {"schema": schema}
+    }
+}
+```
+
+### Dynamic Content Handling
+
+```python
+request = {
+    "urls": "https://www.nbcnews.com/business",
+    "js_code": [
+        "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+    ],
+    "wait_for": "article.tease-card:nth-child(10)"
+}
+```
+
+### AI-Powered Extraction (Full Version)
+
+```python
+request = {
+    "urls": "https://www.nbcnews.com/business",
+    "extraction_config": {
+        "type": "cosine",
+        "params": {
+            "semantic_filter": "business finance economy",
+            "word_count_threshold": 10,
+            "max_dist": 0.2,
+            "top_k": 3
+        }
+    }
+}
+```
+
+## Platform-Specific Instructions 💻
+
+### macOS
+```bash
+docker pull unclecode/crawl4ai:basic
+docker run -p 11235:11235 unclecode/crawl4ai:basic
+```
+
+### Ubuntu
+```bash
+# Basic version
+docker pull unclecode/crawl4ai:basic
+docker run -p 11235:11235 unclecode/crawl4ai:basic
+
+# With GPU support
+docker pull unclecode/crawl4ai:gpu
+docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu
+```
+
+### Windows (PowerShell)
+```powershell
+docker pull unclecode/crawl4ai:basic
+docker run -p 11235:11235 unclecode/crawl4ai:basic
+```
+
+## Testing 🧪
+
+Save this as `test_docker.py`:
+
+```python
+import requests
+import json
+import time
+import sys
+
+class Crawl4AiTester:
+    def __init__(self, base_url: str = "http://localhost:11235"):
+        self.base_url = base_url
+        
+    def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict:
+        # Submit crawl job
+        response = requests.post(f"{self.base_url}/crawl", json=request_data)
+        task_id = response.json()["task_id"]
+        print(f"Task ID: {task_id}")
+        
+        # Poll for result
+        start_time = time.time()
+        while True:
+            if time.time() - start_time > timeout:
+                raise TimeoutError(f"Task {task_id} timeout")
+                
+            result = requests.get(f"{self.base_url}/task/{task_id}")
+            status = result.json()
+            
+            if status["status"] == "completed":
+                return status
+                
+            time.sleep(2)
+
+def test_deployment():
+    tester = Crawl4AiTester()
+    
+    # Test basic crawl
+    request = {
+        "urls": "https://www.nbcnews.com/business",
+        "priority": 10
+    }
+    
+    result = tester.submit_and_wait(request)
+    print("Basic crawl successful!")
+    print(f"Content length: {len(result['result']['markdown'])}")
+
+if __name__ == "__main__":
+    test_deployment()
+```
+
+## Advanced Configuration ⚙️
+
+### Crawler Parameters
+
+The `crawler_params` field allows you to configure the browser instance and crawling behavior. Here are key parameters you can use:
+
+```python
+request = {
+    "urls": "https://example.com",
+    "crawler_params": {
+        # Browser Configuration
+        "headless": True,                    # Run in headless mode
+        "browser_type": "chromium",          # chromium/firefox/webkit
+        "user_agent": "custom-agent",        # Custom user agent
+        "proxy": "http://proxy:8080",        # Proxy configuration
+        
+        # Performance & Behavior
+        "page_timeout": 30000,               # Page load timeout (ms)
+        "verbose": True,                     # Enable detailed logging
+        "semaphore_count": 5,               # Concurrent request limit
+        
+        # Anti-Detection Features
+        "simulate_user": True,               # Simulate human behavior
+        "magic": True,                       # Advanced anti-detection
+        "override_navigator": True,          # Override navigator properties
+        
+        # Session Management
+        "user_data_dir": "./browser-data",   # Browser profile location
+        "use_managed_browser": True,         # Use persistent browser
+    }
+}
+```
+
+### Extra Parameters
+
+The `extra` field allows passing additional parameters directly to the crawler's `arun` function:
+
+```python
+request = {
+    "urls": "https://example.com",
+    "extra": {
+        "word_count_threshold": 10,          # Min words per block
+        "only_text": True,                   # Extract only text
+        "bypass_cache": True,                # Force fresh crawl
+        "process_iframes": True,             # Include iframe content
+    }
+}
+```
+
+### Complete Examples
+
+1. **Advanced News Crawling**
+```python
+request = {
+    "urls": "https://www.nbcnews.com/business",
+    "crawler_params": {
+        "headless": True,
+        "page_timeout": 30000,
+        "remove_overlay_elements": True      # Remove popups
+    },
+    "extra": {
+        "word_count_threshold": 50,          # Longer content blocks
+        "bypass_cache": True                 # Fresh content
+    },
+    "css_selector": ".article-body"
+}
+```
+
+2. **Anti-Detection Configuration**
+```python
+request = {
+    "urls": "https://example.com",
+    "crawler_params": {
+        "simulate_user": True,
+        "magic": True,
+        "override_navigator": True,
+        "user_agent": "Mozilla/5.0 ...",
+        "headers": {
+            "Accept-Language": "en-US,en;q=0.9"
+        }
+    }
+}
+```
+
+3. **LLM Extraction with Custom Parameters**
+```python
+request = {
+    "urls": "https://openai.com/pricing",
+    "extraction_config": {
+        "type": "llm",
+        "params": {
+            "provider": "openai/gpt-4",
+            "schema": pricing_schema
+        }
+    },
+    "crawler_params": {
+        "verbose": True,
+        "page_timeout": 60000
+    },
+    "extra": {
+        "word_count_threshold": 1,
+        "only_text": True
+    }
+}
+```
+
+4. **Session-Based Dynamic Content**
+```python
+request = {
+    "urls": "https://example.com",
+    "crawler_params": {
+        "session_id": "dynamic_session",
+        "headless": False,
+        "page_timeout": 60000
+    },
+    "js_code": ["window.scrollTo(0, document.body.scrollHeight);"],
+    "wait_for": "js:() => document.querySelectorAll('.item').length > 10",
+    "extra": {
+        "delay_before_return_html": 2.0
+    }
+}
+```
+
+5. **Screenshot with Custom Timing**
+```python
+request = {
+    "urls": "https://example.com",
+    "screenshot": True,
+    "crawler_params": {
+        "headless": True,
+        "screenshot_wait_for": 2.0           # Wait before taking the screenshot
+    },
+    "extra": {
+        "delay_before_return_html": 3.0
+    }
+}
+```
+
+### Parameter Reference Table
+
+| Category | Parameter | Type | Description |
+|----------|-----------|------|-------------|
+| Browser | headless | bool | Run browser in headless mode |
+| Browser | browser_type | str | Browser engine to use (chromium/firefox/webkit) |
+| Browser | user_agent | str | Custom user agent string |
+| Network | proxy | str | Proxy server URL |
+| Network | headers | dict | Custom HTTP headers |
+| Timing | page_timeout | int | Page load timeout (ms) |
+| Timing | delay_before_return_html | float | Additional delay before returning HTML |
+| Anti-Detection | simulate_user | bool | Human behavior simulation |
+| Anti-Detection | magic | bool | Advanced anti-detection features |
+| Session | session_id | str | Browser session ID |
+| Session | user_data_dir | str | Profile directory |
+| Content | word_count_threshold | int | Minimum words per block |
+| Content | only_text | bool | Text-only extraction |
+| Content | process_iframes | bool | Include iframe content |
+| Debug | verbose | bool | Detailed logging |
+| Debug | log_console | bool | Browser console logs |
+
+## Troubleshooting 🔍
+
+### Common Issues
+
+1. **Connection Refused**
+   ```
+   Error: Connection refused at localhost:11235
+   ```
+   Solution: Ensure the container is running and ports are properly mapped.
+
+2. **Resource Limits**
+   ```
+   Error: No available slots
+   ```
+   Solution: Increase the `MAX_CONCURRENT_TASKS` environment variable or the container's resource limits.
+
+3. **GPU Access**
+   ```
+   Error: GPU not found
+   ```
+   Solution: Ensure the NVIDIA drivers are installed correctly and run the container with the `--gpus all` flag.
+
+### Debug Mode
+
+Access container for debugging:
+```bash
+docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all
+```
+
+View container logs:
+```bash
+docker logs [container_id]
+```
+
+## Best Practices 🌟
+
+1. **Resource Management**
+   - Set appropriate memory and CPU limits
+   - Monitor resource usage via health endpoint
+   - Use basic version for simple crawling tasks
+
+2. **Scaling**
+   - Use multiple containers for high load
+   - Implement proper load balancing
+   - Monitor performance metrics
+
+3. **Security**
+   - Use environment variables for sensitive data
+   - Implement proper network isolation
+   - Apply security updates regularly
+
+## API Reference 📚
+
+### Health Check
+```http
+GET /health
+```
+
+### Submit Crawl Task
+```http
+POST /crawl
+Content-Type: application/json
+
+{
+    "urls": "string or array",
+    "extraction_config": {
+        "type": "basic|llm|cosine|json_css",
+        "params": {}
+    },
+    "priority": 1-10,
+    "ttl": 3600
+}
+```
+
+### Get Task Status
+```http
+GET /task/{task_id}
+```
+
+For more details, visit the [official documentation](https://crawl4ai.com/mkdocs/).
--- a/docs/md_v2/index.md
+++ b/docs/md_v2/index.md
@@ -72,7 +72,7 @@ Our documentation is organized into several sections:
 ### Advanced Features
 - [Magic Mode](advanced/magic-mode.md)
 - [Session Management](advanced/session-management.md)
- [Hooks & Authentication](advanced/hooks.md)
+- [Hooks & Authentication](advanced/hooks-auth.md)
 - [Proxy & Security](advanced/proxy-security.md)
 - [Content Processing](advanced/content-processing.md)