Merge branch 'implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp' into develop

2025-10-22 13:12:25 +02:00
parent 6d1a398419 d670dcde0a
commit b71d624168
14 changed files with 2579 additions and 10 deletions
--- a/docs/examples/docker_webhook_example.py
+++ b/docs/examples/docker_webhook_example.py
@@ -0,0 +1,461 @@
+"""
+Docker Webhook Example for Crawl4AI
+
+This example demonstrates how to use webhooks with the Crawl4AI job queue API.
+Instead of polling for results, webhooks notify your application when jobs complete.
+
+Supports both:
+- /crawl/job - Raw crawling with markdown extraction
+- /llm/job - LLM-powered content extraction
+
+Prerequisites:
+1. Crawl4AI Docker container running on localhost:11234
+2. Flask installed: pip install flask requests
+3. LLM API key configured in .llm.env (for LLM extraction examples)
+
+Usage:
+1. Run this script: python docker_webhook_example.py
+2. The webhook server will start on http://localhost:8080
+3. Jobs will be submitted and webhooks will be received automatically
+"""
+
+import requests
+import json
+import time
+from flask import Flask, request, jsonify
+from threading import Thread
+
+# Configuration
+CRAWL4AI_BASE_URL = "http://localhost:11234"
+WEBHOOK_BASE_URL = "http://localhost:8080"  # Your webhook receiver URL
+
+# Initialize Flask app for webhook receiver
+app = Flask(__name__)
+
+# Store received webhook data for demonstration
+received_webhooks = []
+
+
+@app.route('/webhooks/crawl-complete', methods=['POST'])
+def handle_crawl_webhook():
+    """
+    Webhook handler that receives notifications when crawl jobs complete.
+
+    Payload structure:
+    {
+        "task_id": "crawl_abc123",
+        "task_type": "crawl",
+        "status": "completed" or "failed",
+        "timestamp": "2025-10-21T10:30:00.000000+00:00",
+        "urls": ["https://example.com"],
+        "error": "error message" (only if failed),
+        "data": {...} (only if webhook_data_in_payload=True)
+    }
+    """
+    payload = request.json
+    print(f"\n{'='*60}")
+    print(f"📬 Webhook received for task: {payload['task_id']}")
+    print(f"   Status: {payload['status']}")
+    print(f"   Timestamp: {payload['timestamp']}")
+    print(f"   URLs: {payload['urls']}")
+
+    if payload['status'] == 'completed':
+        # If data is in payload, process it directly
+        if 'data' in payload:
+            print(f"   ✅ Data included in webhook")
+            data = payload['data']
+            # Process the crawl results here
+            for result in data.get('results', []):
+                print(f"      - Crawled: {result.get('url')}")
+                print(f"      - Markdown length: {len(result.get('markdown', ''))}")
+        else:
+            # Fetch results from API if not included
+            print(f"   📥 Fetching results from API...")
+            task_id = payload['task_id']
+            result_response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")
+            if result_response.ok:
+                data = result_response.json()
+                print(f"   ✅ Results fetched successfully")
+                # Process the crawl results here
+                for result in data['result'].get('results', []):
+                    print(f"      - Crawled: {result.get('url')}")
+                    print(f"      - Markdown length: {len(result.get('markdown', ''))}")
+
+    elif payload['status'] == 'failed':
+        print(f"   ❌ Job failed: {payload.get('error', 'Unknown error')}")
+
+    print(f"{'='*60}\n")
+
+    # Store webhook for demonstration
+    received_webhooks.append(payload)
+
+    # Return 200 OK to acknowledge receipt
+    return jsonify({"status": "received"}), 200
+
+
+@app.route('/webhooks/llm-complete', methods=['POST'])
+def handle_llm_webhook():
+    """
+    Webhook handler that receives notifications when LLM extraction jobs complete.
+
+    Payload structure:
+    {
+        "task_id": "llm_1698765432_12345",
+        "task_type": "llm_extraction",
+        "status": "completed" or "failed",
+        "timestamp": "2025-10-21T10:30:00.000000+00:00",
+        "urls": ["https://example.com/article"],
+        "error": "error message" (only if failed),
+        "data": {"extracted_content": {...}} (only if webhook_data_in_payload=True)
+    }
+    """
+    payload = request.json
+    print(f"\n{'='*60}")
+    print(f"🤖 LLM Webhook received for task: {payload['task_id']}")
+    print(f"   Task Type: {payload['task_type']}")
+    print(f"   Status: {payload['status']}")
+    print(f"   Timestamp: {payload['timestamp']}")
+    print(f"   URL: {payload['urls'][0]}")
+
+    if payload['status'] == 'completed':
+        # If data is in payload, process it directly
+        if 'data' in payload:
+            print(f"   ✅ Data included in webhook")
+            data = payload['data']
+            # Webhook wraps extracted content in 'extracted_content' field
+            extracted = data.get('extracted_content', {})
+            print(f"      - Extracted content:")
+            print(f"        {json.dumps(extracted, indent=8)}")
+        else:
+            # Fetch results from API if not included
+            print(f"   📥 Fetching results from API...")
+            task_id = payload['task_id']
+            result_response = requests.get(f"{CRAWL4AI_BASE_URL}/llm/job/{task_id}")
+            if result_response.ok:
+                data = result_response.json()
+                print(f"   ✅ Results fetched successfully")
+                # API returns unwrapped content in 'result' field
+                extracted = data['result']
+                print(f"      - Extracted content:")
+                print(f"        {json.dumps(extracted, indent=8)}")
+
+    elif payload['status'] == 'failed':
+        print(f"   ❌ Job failed: {payload.get('error', 'Unknown error')}")
+
+    print(f"{'='*60}\n")
+
+    # Store webhook for demonstration
+    received_webhooks.append(payload)
+
+    # Return 200 OK to acknowledge receipt
+    return jsonify({"status": "received"}), 200
+
+
+def start_webhook_server():
+    """Start the Flask webhook server in a separate thread"""
+    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
+
+
+def submit_crawl_job_with_webhook(urls, webhook_url, include_data=False):
+    """
+    Submit a crawl job with webhook notification.
+
+    Args:
+        urls: List of URLs to crawl
+        webhook_url: URL to receive webhook notifications
+        include_data: Whether to include full results in webhook payload
+
+    Returns:
+        task_id: The job's task identifier
+    """
+    payload = {
+        "urls": urls,
+        "browser_config": {"headless": True},
+        "crawler_config": {"cache_mode": "bypass"},
+        "webhook_config": {
+            "webhook_url": webhook_url,
+            "webhook_data_in_payload": include_data,
+            # Optional: Add custom headers for authentication
+            # "webhook_headers": {
+            #     "X-Webhook-Secret": "your-secret-token"
+            # }
+        }
+    }
+
+    print(f"\n🚀 Submitting crawl job...")
+    print(f"   URLs: {urls}")
+    print(f"   Webhook: {webhook_url}")
+    print(f"   Include data: {include_data}")
+
+    response = requests.post(
+        f"{CRAWL4AI_BASE_URL}/crawl/job",
+        json=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    if response.ok:
+        data = response.json()
+        task_id = data['task_id']
+        print(f"   ✅ Job submitted successfully")
+        print(f"   Task ID: {task_id}")
+        return task_id
+    else:
+        print(f"   ❌ Failed to submit job: {response.text}")
+        return None
+
+
+def submit_llm_job_with_webhook(url, query, webhook_url, include_data=False, schema=None, provider=None):
+    """
+    Submit an LLM extraction job with webhook notification.
+
+    Args:
+        url: URL to extract content from
+        query: Instruction for the LLM (e.g., "Extract article title and author")
+        webhook_url: URL to receive webhook notifications
+        include_data: Whether to include full results in webhook payload
+        schema: Optional JSON schema for structured extraction
+        provider: Optional LLM provider (e.g., "openai/gpt-4o-mini")
+
+    Returns:
+        task_id: The job's task identifier
+    """
+    payload = {
+        "url": url,
+        "q": query,
+        "cache": False,
+        "webhook_config": {
+            "webhook_url": webhook_url,
+            "webhook_data_in_payload": include_data,
+            # Optional: Add custom headers for authentication
+            # "webhook_headers": {
+            #     "X-Webhook-Secret": "your-secret-token"
+            # }
+        }
+    }
+
+    if schema:
+        payload["schema"] = schema
+
+    if provider:
+        payload["provider"] = provider
+
+    print(f"\n🤖 Submitting LLM extraction job...")
+    print(f"   URL: {url}")
+    print(f"   Query: {query}")
+    print(f"   Webhook: {webhook_url}")
+    print(f"   Include data: {include_data}")
+    if provider:
+        print(f"   Provider: {provider}")
+
+    response = requests.post(
+        f"{CRAWL4AI_BASE_URL}/llm/job",
+        json=payload,
+        headers={"Content-Type": "application/json"}
+    )
+
+    if response.ok:
+        data = response.json()
+        task_id = data['task_id']
+        print(f"   ✅ Job submitted successfully")
+        print(f"   Task ID: {task_id}")
+        return task_id
+    else:
+        print(f"   ❌ Failed to submit job: {response.text}")
+        return None
+
+
+def submit_job_without_webhook(urls):
+    """
+    Submit a job without webhook (traditional polling approach).
+
+    Args:
+        urls: List of URLs to crawl
+
+    Returns:
+        task_id: The job's task identifier
+    """
+    payload = {
+        "urls": urls,
+        "browser_config": {"headless": True},
+        "crawler_config": {"cache_mode": "bypass"}
+    }
+
+    print(f"\n🚀 Submitting crawl job (without webhook)...")
+    print(f"   URLs: {urls}")
+
+    response = requests.post(
+        f"{CRAWL4AI_BASE_URL}/crawl/job",
+        json=payload
+    )
+
+    if response.ok:
+        data = response.json()
+        task_id = data['task_id']
+        print(f"   ✅ Job submitted successfully")
+        print(f"   Task ID: {task_id}")
+        return task_id
+    else:
+        print(f"   ❌ Failed to submit job: {response.text}")
+        return None
+
+
+def poll_job_status(task_id, timeout=60):
+    """
+    Poll for job status (used when webhook is not configured).
+
+    Args:
+        task_id: The job's task identifier
+        timeout: Maximum time to wait in seconds
+    """
+    print(f"\n⏳ Polling for job status...")
+    start_time = time.time()
+
+    while time.time() - start_time < timeout:
+        response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")
+
+        if response.ok:
+            data = response.json()
+            status = data.get('status', 'unknown')
+
+            if status == 'completed':
+                print(f"   ✅ Job completed!")
+                return data
+            elif status == 'failed':
+                print(f"   ❌ Job failed: {data.get('error', 'Unknown error')}")
+                return data
+            else:
+                print(f"   ⏳ Status: {status}, waiting...")
+                time.sleep(2)
+        else:
+            print(f"   ❌ Failed to get status: {response.text}")
+            return None
+
+    print(f"   ⏰ Timeout reached")
+    return None
+
+
+def main():
+    """Run the webhook demonstration"""
+
+    # Check if Crawl4AI is running
+    try:
+        health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
+        print(f"✅ Crawl4AI is running: {health.json()}")
+    except:
+        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
+        print("   Please make sure Docker container is running:")
+        print("   docker run -d -p 11234:11234 --name crawl4ai unclecode/crawl4ai:latest")
+        return
+
+    # Start webhook server in background thread
+    print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
+    webhook_thread = Thread(target=start_webhook_server, daemon=True)
+    webhook_thread.start()
+    time.sleep(2)  # Give server time to start
+
+    # Example 1: Job with webhook (notification only, fetch data separately)
+    print(f"\n{'='*60}")
+    print("Example 1: Webhook Notification Only")
+    print(f"{'='*60}")
+    task_id_1 = submit_crawl_job_with_webhook(
+        urls=["https://example.com"],
+        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
+        include_data=False
+    )
+
+    # Example 2: Job with webhook (data included in payload)
+    time.sleep(5)  # Wait a bit between requests
+    print(f"\n{'='*60}")
+    print("Example 2: Webhook with Full Data")
+    print(f"{'='*60}")
+    task_id_2 = submit_crawl_job_with_webhook(
+        urls=["https://www.python.org"],
+        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
+        include_data=True
+    )
+
+    # Example 3: LLM extraction with webhook (notification only)
+    time.sleep(5)  # Wait a bit between requests
+    print(f"\n{'='*60}")
+    print("Example 3: LLM Extraction with Webhook (Notification Only)")
+    print(f"{'='*60}")
+    task_id_3 = submit_llm_job_with_webhook(
+        url="https://www.example.com",
+        query="Extract the main heading and description from this page.",
+        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
+        include_data=False,
+        provider="openai/gpt-4o-mini"
+    )
+
+    # Example 4: LLM extraction with webhook (data included + schema)
+    time.sleep(5)  # Wait a bit between requests
+    print(f"\n{'='*60}")
+    print("Example 4: LLM Extraction with Schema and Full Data")
+    print(f"{'='*60}")
+
+    # Define a schema for structured extraction
+    schema = json.dumps({
+        "type": "object",
+        "properties": {
+            "title": {"type": "string", "description": "Page title"},
+            "description": {"type": "string", "description": "Page description"}
+        },
+        "required": ["title"]
+    })
+
+    task_id_4 = submit_llm_job_with_webhook(
+        url="https://www.python.org",
+        query="Extract the title and description of this website",
+        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
+        include_data=True,
+        schema=schema,
+        provider="openai/gpt-4o-mini"
+    )
+
+    # Example 5: Traditional polling (no webhook)
+    time.sleep(5)  # Wait a bit between requests
+    print(f"\n{'='*60}")
+    print("Example 5: Traditional Polling (No Webhook)")
+    print(f"{'='*60}")
+    task_id_5 = submit_job_without_webhook(
+        urls=["https://github.com"]
+    )
+    if task_id_5:
+        result = poll_job_status(task_id_5)
+        if result and result.get('status') == 'completed':
+            print(f"   ✅ Results retrieved via polling")
+
+    # Wait for webhooks to arrive
+    print(f"\n⏳ Waiting for webhooks to be received...")
+    time.sleep(30)  # Give jobs time to complete and webhooks to arrive (longer for LLM)
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("Summary")
+    print(f"{'='*60}")
+    print(f"Total webhooks received: {len(received_webhooks)}")
+
+    crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
+    llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']
+
+    print(f"\n📊 Breakdown:")
+    print(f"   - Crawl webhooks: {len(crawl_webhooks)}")
+    print(f"   - LLM extraction webhooks: {len(llm_webhooks)}")
+
+    print(f"\n📋 Details:")
+    for i, webhook in enumerate(received_webhooks, 1):
+        task_type = webhook['task_type']
+        icon = "🕷️" if task_type == "crawl" else "🤖"
+        print(f"{i}. {icon} Task {webhook['task_id']}: {webhook['status']} ({task_type})")
+
+    print(f"\n✅ Demo completed!")
+    print(f"\n💡 Pro tips:")
+    print(f"   - In production, your webhook URL should be publicly accessible")
+    print(f"     (e.g., https://myapp.com/webhooks) or use ngrok for testing")
+    print(f"   - Both /crawl/job and /llm/job support the same webhook configuration")
+    print(f"   - Use webhook_data_in_payload=true to get results directly in the webhook")
+    print(f"   - LLM jobs may take longer, adjust timeouts accordingly")
+
+
+if __name__ == "__main__":
+    main()