feat: add webhook support for /llm/job endpoint

Add comprehensive webhook notification support for the /llm/job endpoint, following the same pattern as the existing /crawl/job implementation. Changes: - Add webhook_config field to LlmJobPayload model (job.py) - Implement webhook notifications in process_llm_extraction() with 4 notification points: success, provider validation failure, extraction failure, and general exceptions (api.py) - Store webhook_config in Redis task data for job tracking - Initialize WebhookDeliveryService with exponential backoff retry logic Documentation: - Add Example 6 to WEBHOOK_EXAMPLES.md showing LLM extraction with webhooks - Update Flask webhook handler to support both crawl and llm_extraction tasks - Add TypeScript client examples for LLM jobs - Add comprehensive examples to docker_webhook_example.py with schema support - Clarify data structure differences between webhook and API responses Testing: - Add test_llm_webhook_feature.py with 7 validation tests (all passing) - Verify pattern consistency with /crawl/job implementation - Add implementation guide (WEBHOOK_LLM_JOB_IMPLEMENTATION.md)
2025-10-22 13:03:09 +02:00
parent f8606f6865
commit d670dcde0a
6 changed files with 770 additions and 31 deletions
--- a/deploy/docker/WEBHOOK_EXAMPLES.md
+++ b/deploy/docker/WEBHOOK_EXAMPLES.md
@@ -164,9 +164,55 @@ curl -X POST http://localhost:11235/crawl/job \
 The webhook will be sent to the default URL configured in config.yml.
 ### Example 6: LLM Extraction Job with Webhook
 Use webhooks with the LLM extraction endpoint for asynchronous processing.
 **Request:**
 ```bash
 curl -X POST http://localhost:11235/llm/job \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example.com/article",
    "q": "Extract the article title, author, and publication date",
    "schema": "{\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"author\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}}}",
    "cache": false,
    "provider": "openai/gpt-4o-mini",
    "webhook_config": {
      "webhook_url": "https://myapp.com/webhooks/llm-complete",
      "webhook_data_in_payload": true
    }
  }'
 ```
 **Response:**
 ```json
 {
  "task_id": "llm_1698765432_12345"
 }
 ```
 **Webhook Payload Received:**
 ```json
 {
  "task_id": "llm_1698765432_12345",
  "task_type": "llm_extraction",
  "status": "completed",
  "timestamp": "2025-10-21T10:30:00.000000+00:00",
  "urls": ["https://example.com/article"],
  "data": {
    "extracted_content": {
      "title": "Understanding Web Scraping",
      "author": "John Doe",
      "date": "2025-10-21"
    }
  }
 }
 ```
 ## Webhook Handler Example
-Here's a simple Python Flask webhook handler:
+Here's a simple Python Flask webhook handler that supports both crawl and LLM extraction jobs:
 ```python
 from flask import Flask, request, jsonify
@@ -179,23 +225,39 @@ def handle_crawl_webhook():
    payload = request.json
    task_id = payload['task_id']
    task_type = payload['task_type']
    status = payload['status']
    if status == 'completed':
        # If data not in payload, fetch it
        if 'data' not in payload:
-            response = requests.get(f'http://localhost:11235/crawl/job/{task_id}')
+            # Determine endpoint based on task type
            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
            response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
            data = response.json()
        else:
            data = payload['data']
-        # Process the crawl data
+        # Process based on task type
-        print(f"Processing crawl results for {task_id}")
+        if task_type == 'crawl':
            print(f"Processing crawl results for {task_id}")
            # Handle crawl results
            results = data.get('results', [])
            for result in results:
                print(f"  - {result.get('url')}: {len(result.get('markdown', ''))} chars")
        elif task_type == 'llm_extraction':
            print(f"Processing LLM extraction for {task_id}")
            # Handle LLM extraction
            # Note: Webhook sends 'extracted_content', API returns 'result'
            extracted = data.get('extracted_content', data.get('result', {}))
            print(f"  - Extracted: {extracted}")
        # Your business logic here...
    elif status == 'failed':
        error = payload.get('error', 'Unknown error')
-        print(f"Crawl job {task_id} failed: {error}")
+        print(f"{task_type} job {task_id} failed: {error}")
        # Handle failure...
    return jsonify({"status": "received"}), 200
@@ -227,6 +289,7 @@ The webhook delivery service uses exponential backoff retry logic:
 4. **Flexible** - Choose between notification-only or full data delivery
 5. **Secure** - Support for custom headers for authentication
 6. **Configurable** - Global defaults or per-job configuration
 7. **Universal Support** - Works with both `/crawl/job` and `/llm/job` endpoints
 ## TypeScript Client Example
@@ -244,6 +307,15 @@ interface CrawlJobRequest {
  webhook_config?: WebhookConfig;
 }
 interface LLMJobRequest {
  url: string;
  q: string;
  schema?: string;
  cache?: boolean;
  provider?: string;
  webhook_config?: WebhookConfig;
 }
 async function createCrawlJob(request: CrawlJobRequest) {
  const response = await fetch('http://localhost:11235/crawl/job', {
    method: 'POST',
@@ -255,8 +327,19 @@ async function createCrawlJob(request: CrawlJobRequest) {
  return task_id;
 }
-// Usage
+async function createLLMJob(request: LLMJobRequest) {
-const taskId = await createCrawlJob({
+  const response = await fetch('http://localhost:11235/llm/job', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(request)
  });
  const { task_id } = await response.json();
  return task_id;
 }
 // Usage - Crawl Job
 const crawlTaskId = await createCrawlJob({
  urls: ['https://example.com'],
  webhook_config: {
    webhook_url: 'https://myapp.com/webhooks/crawl-complete',
@@ -266,6 +349,20 @@ const taskId = await createCrawlJob({
    }
  }
 });
 // Usage - LLM Extraction Job
 const llmTaskId = await createLLMJob({
  url: 'https://example.com/article',
  q: 'Extract the main points from this article',
  provider: 'openai/gpt-4o-mini',
  webhook_config: {
    webhook_url: 'https://myapp.com/webhooks/llm-complete',
    webhook_data_in_payload: true,
    webhook_headers: {
      'X-Webhook-Secret': 'my-secret'
    }
  }
 });
 ```
 ## Monitoring and Debugging
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -116,9 +116,13 @@ async def process_llm_extraction(
    instruction: str,
    schema: Optional[str] = None,
    cache: str = "0",
-    provider: Optional[str] = None
+    provider: Optional[str] = None,
    webhook_config: Optional[Dict] = None
 ) -> None:
    """Process LLM extraction in background."""
    # Initialize webhook service
    webhook_service = WebhookDeliveryService(config)
    try:
        # Validate provider
        is_valid, error_msg = validate_llm_provider(config, provider)
@@ -127,6 +131,16 @@ async def process_llm_extraction(
                "status": TaskStatus.FAILED,
                "error": error_msg
            })
            # Send webhook notification on failure
            await webhook_service.notify_job_completion(
                task_id=task_id,
                task_type="llm_extraction",
                status="failed",
                urls=[url],
                webhook_config=webhook_config,
                error=error_msg
            )
            return
        api_key = get_llm_api_key(config, provider)
        llm_strategy = LLMExtractionStrategy(
@@ -155,17 +169,40 @@ async def process_llm_extraction(
                "status": TaskStatus.FAILED,
                "error": result.error_message
            })
            # Send webhook notification on failure
            await webhook_service.notify_job_completion(
                task_id=task_id,
                task_type="llm_extraction",
                status="failed",
                urls=[url],
                webhook_config=webhook_config,
                error=result.error_message
            )
            return
        try:
            content = json.loads(result.extracted_content)
        except json.JSONDecodeError:
            content = result.extracted_content
        result_data = {"extracted_content": content}
        await redis.hset(f"task:{task_id}", mapping={
            "status": TaskStatus.COMPLETED,
            "result": json.dumps(content)
        })
        # Send webhook notification on successful completion
        await webhook_service.notify_job_completion(
            task_id=task_id,
            task_type="llm_extraction",
            status="completed",
            urls=[url],
            webhook_config=webhook_config,
            result=result_data
        )
    except Exception as e:
        logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
        await redis.hset(f"task:{task_id}", mapping={
@@ -173,6 +210,16 @@ async def process_llm_extraction(
            "error": str(e)
        })
        # Send webhook notification on failure
        await webhook_service.notify_job_completion(
            task_id=task_id,
            task_type="llm_extraction",
            status="failed",
            urls=[url],
            webhook_config=webhook_config,
            error=str(e)
        )
 async def handle_markdown_request(
    url: str,
    filter_type: FilterType,
@@ -249,7 +296,8 @@ async def handle_llm_request(
    schema: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None,
-    provider: Optional[str] = None
+    provider: Optional[str] = None,
    webhook_config: Optional[Dict] = None,
 ) -> JSONResponse:
    """Handle LLM extraction requests."""
    base_url = get_base_url(request)
@@ -280,7 +328,8 @@ async def handle_llm_request(
            cache,
            base_url,
            config,
-            provider
+            provider,
            webhook_config
        )
    except Exception as e:
@@ -325,7 +374,8 @@ async def create_new_task(
    cache: str,
    base_url: str,
    config: dict,
-    provider: Optional[str] = None
+    provider: Optional[str] = None,
    webhook_config: Optional[Dict] = None
 ) -> JSONResponse:
    """Create and initialize a new task."""
    decoded_url = unquote(input_path)
@@ -334,12 +384,18 @@ async def create_new_task(
    from datetime import datetime
    task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
-    
+
-    await redis.hset(f"task:{task_id}", mapping={
+    task_data = {
        "status": TaskStatus.PROCESSING,
        "created_at": datetime.now().isoformat(),
        "url": decoded_url
-    })
+    }
    # Store webhook config if provided
    if webhook_config:
        task_data["webhook_config"] = json.dumps(webhook_config)
    await redis.hset(f"task:{task_id}", mapping=task_data)
    background_tasks.add_task(
        process_llm_extraction,
@@ -350,7 +406,8 @@ async def create_new_task(
        query,
        schema,
        cache,
-        provider
+        provider,
        webhook_config
    )
    return JSONResponse({
--- a/deploy/docker/job.py
+++ b/deploy/docker/job.py
@@ -38,6 +38,7 @@ class LlmJobPayload(BaseModel):
    schema: Optional[str] = None
    cache:  bool = False
    provider: Optional[str] = None
    webhook_config: Optional[WebhookConfig] = None
 class CrawlJobPayload(BaseModel):
@@ -55,6 +56,10 @@ async def llm_job_enqueue(
        request: Request,
        _td: Dict = Depends(lambda: _token_dep()),   # late-bound dep
 ):
    webhook_config = None
    if payload.webhook_config:
        webhook_config = payload.webhook_config.model_dump(mode='json')
    return await handle_llm_request(
        _redis,
        background_tasks,
@@ -65,6 +70,7 @@ async def llm_job_enqueue(
        cache=payload.cache,
        config=_config,
        provider=payload.provider,
        webhook_config=webhook_config,
    )
@@ -74,7 +80,7 @@ async def llm_job_status(
    task_id: str,
    _td: Dict = Depends(lambda: _token_dep())
 ):
-    return await handle_task_status(_redis, task_id)
+    return await handle_task_status(_redis, task_id, base_url=str(request.base_url))
 # ---------- CRAWL job -------------------------------------------------------
--- a/docs/examples/docker_webhook_example.py
+++ b/docs/examples/docker_webhook_example.py
@@ -2,11 +2,16 @@
 Docker Webhook Example for Crawl4AI
 This example demonstrates how to use webhooks with the Crawl4AI job queue API.
-Instead of polling for results, webhooks notify your application when crawls complete.
+Instead of polling for results, webhooks notify your application when jobs complete.
 Supports both:
 - /crawl/job - Raw crawling with markdown extraction
 - /llm/job - LLM-powered content extraction
 Prerequisites:
-1. Crawl4AI Docker container running on localhost:11235
+1. Crawl4AI Docker container running on localhost:11234
 2. Flask installed: pip install flask requests
 3. LLM API key configured in .llm.env (for LLM extraction examples)
 Usage:
 1. Run this script: python docker_webhook_example.py
@@ -21,7 +26,7 @@ from flask import Flask, request, jsonify
 from threading import Thread
 # Configuration
-CRAWL4AI_BASE_URL = "http://localhost:11235"
+CRAWL4AI_BASE_URL = "http://localhost:11234"
 WEBHOOK_BASE_URL = "http://localhost:8080"  # Your webhook receiver URL
 # Initialize Flask app for webhook receiver
@@ -88,6 +93,64 @@ def handle_crawl_webhook():
    return jsonify({"status": "received"}), 200
@app.route('/webhooks/llm-complete', methods=['POST'])
 def handle_llm_webhook():
    """
    Webhook handler that receives notifications when LLM extraction jobs complete.
    Payload structure:
    {
        "task_id": "llm_1698765432_12345",
        "task_type": "llm_extraction",
        "status": "completed" or "failed",
        "timestamp": "2025-10-21T10:30:00.000000+00:00",
        "urls": ["https://example.com/article"],
        "error": "error message" (only if failed),
        "data": {"extracted_content": {...}} (only if webhook_data_in_payload=True)
    }
    """
    payload = request.json
    print(f"\n{'='*60}")
    print(f"🤖 LLM Webhook received for task: {payload['task_id']}")
    print(f"   Task Type: {payload['task_type']}")
    print(f"   Status: {payload['status']}")
    print(f"   Timestamp: {payload['timestamp']}")
    print(f"   URL: {payload['urls'][0]}")
    if payload['status'] == 'completed':
        # If data is in payload, process it directly
        if 'data' in payload:
            print(f"   ✅ Data included in webhook")
            data = payload['data']
            # Webhook wraps extracted content in 'extracted_content' field
            extracted = data.get('extracted_content', {})
            print(f"      - Extracted content:")
            print(f"        {json.dumps(extracted, indent=8)}")
        else:
            # Fetch results from API if not included
            print(f"   📥 Fetching results from API...")
            task_id = payload['task_id']
            result_response = requests.get(f"{CRAWL4AI_BASE_URL}/llm/job/{task_id}")
            if result_response.ok:
                data = result_response.json()
                print(f"   ✅ Results fetched successfully")
                # API returns unwrapped content in 'result' field
                extracted = data['result']
                print(f"      - Extracted content:")
                print(f"        {json.dumps(extracted, indent=8)}")
    elif payload['status'] == 'failed':
        print(f"   ❌ Job failed: {payload.get('error', 'Unknown error')}")
    print(f"{'='*60}\n")
    # Store webhook for demonstration
    received_webhooks.append(payload)
    # Return 200 OK to acknowledge receipt
    return jsonify({"status": "received"}), 200
 def start_webhook_server():
    """Start the Flask webhook server in a separate thread"""
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
@@ -141,6 +204,66 @@ def submit_crawl_job_with_webhook(urls, webhook_url, include_data=False):
        return None
 def submit_llm_job_with_webhook(url, query, webhook_url, include_data=False, schema=None, provider=None):
    """
    Submit an LLM extraction job with webhook notification.
    Args:
        url: URL to extract content from
        query: Instruction for the LLM (e.g., "Extract article title and author")
        webhook_url: URL to receive webhook notifications
        include_data: Whether to include full results in webhook payload
        schema: Optional JSON schema for structured extraction
        provider: Optional LLM provider (e.g., "openai/gpt-4o-mini")
    Returns:
        task_id: The job's task identifier
    """
    payload = {
        "url": url,
        "q": query,
        "cache": False,
        "webhook_config": {
            "webhook_url": webhook_url,
            "webhook_data_in_payload": include_data,
            # Optional: Add custom headers for authentication
            # "webhook_headers": {
            #     "X-Webhook-Secret": "your-secret-token"
            # }
        }
    }
    if schema:
        payload["schema"] = schema
    if provider:
        payload["provider"] = provider
    print(f"\n🤖 Submitting LLM extraction job...")
    print(f"   URL: {url}")
    print(f"   Query: {query}")
    print(f"   Webhook: {webhook_url}")
    print(f"   Include data: {include_data}")
    if provider:
        print(f"   Provider: {provider}")
    response = requests.post(
        f"{CRAWL4AI_BASE_URL}/llm/job",
        json=payload,
        headers={"Content-Type": "application/json"}
    )
    if response.ok:
        data = response.json()
        task_id = data['task_id']
        print(f"   ✅ Job submitted successfully")
        print(f"   Task ID: {task_id}")
        return task_id
    else:
        print(f"   ❌ Failed to submit job: {response.text}")
        return None
 def submit_job_without_webhook(urls):
    """
    Submit a job without webhook (traditional polling approach).
@@ -221,7 +344,7 @@ def main():
    except:
        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
        print("   Please make sure Docker container is running:")
-        print("   docker run -d -p 11235:11235 --name crawl4ai unclecode/crawl4ai:latest")
+        print("   docker run -d -p 11234:11234 --name crawl4ai unclecode/crawl4ai:latest")
        return
    # Start webhook server in background thread
@@ -251,34 +374,87 @@ def main():
        include_data=True
    )
-    # Example 3: Traditional polling (no webhook)
+    # Example 3: LLM extraction with webhook (notification only)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
-    print("Example 3: Traditional Polling (No Webhook)")
+    print("Example 3: LLM Extraction with Webhook (Notification Only)")
    print(f"{'='*60}")
-    task_id_3 = submit_job_without_webhook(
+    task_id_3 = submit_llm_job_with_webhook(
        url="https://www.example.com",
        query="Extract the main heading and description from this page.",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=False,
        provider="openai/gpt-4o-mini"
    )
    # Example 4: LLM extraction with webhook (data included + schema)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 4: LLM Extraction with Schema and Full Data")
    print(f"{'='*60}")
    # Define a schema for structured extraction
    schema = json.dumps({
        "type": "object",
        "properties": {
            "title": {"type": "string", "description": "Page title"},
            "description": {"type": "string", "description": "Page description"}
        },
        "required": ["title"]
    })
    task_id_4 = submit_llm_job_with_webhook(
        url="https://www.python.org",
        query="Extract the title and description of this website",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=True,
        schema=schema,
        provider="openai/gpt-4o-mini"
    )
    # Example 5: Traditional polling (no webhook)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 5: Traditional Polling (No Webhook)")
    print(f"{'='*60}")
    task_id_5 = submit_job_without_webhook(
        urls=["https://github.com"]
    )
-    if task_id_3:
+    if task_id_5:
-        result = poll_job_status(task_id_3)
+        result = poll_job_status(task_id_5)
        if result and result.get('status') == 'completed':
            print(f"   ✅ Results retrieved via polling")
    # Wait for webhooks to arrive
    print(f"\n⏳ Waiting for webhooks to be received...")
-    time.sleep(20)  # Give jobs time to complete and webhooks to arrive
+    time.sleep(30)  # Give jobs time to complete and webhooks to arrive (longer for LLM)
    # Summary
    print(f"\n{'='*60}")
    print("Summary")
    print(f"{'='*60}")
    print(f"Total webhooks received: {len(received_webhooks)}")
    crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
    llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']
    print(f"\n📊 Breakdown:")
    print(f"   - Crawl webhooks: {len(crawl_webhooks)}")
    print(f"   - LLM extraction webhooks: {len(llm_webhooks)}")
    print(f"\n📋 Details:")
    for i, webhook in enumerate(received_webhooks, 1):
-        print(f"{i}. Task {webhook['task_id']}: {webhook['status']}")
+        task_type = webhook['task_type']
        icon = "🕷️" if task_type == "crawl" else "🤖"
        print(f"{i}. {icon} Task {webhook['task_id']}: {webhook['status']} ({task_type})")
    print(f"\n✅ Demo completed!")
-    print(f"\n💡 Pro tip: In production, your webhook URL should be publicly accessible")
+    print(f"\n💡 Pro tips:")
-    print(f"   (e.g., https://myapp.com/webhooks/crawl) or use a service like ngrok for testing.")
+    print(f"   - In production, your webhook URL should be publicly accessible")
    print(f"     (e.g., https://myapp.com/webhooks) or use ngrok for testing")
    print(f"   - Both /crawl/job and /llm/job support the same webhook configuration")
    print(f"   - Use webhook_data_in_payload=true to get results directly in the webhook")
    print(f"   - LLM jobs may take longer, adjust timeouts accordingly")
 if __name__ == "__main__":
--- a/test_llm_webhook_feature.py
+++ b/test_llm_webhook_feature.py
@@ -0,0 +1,401 @@
 #!/usr/bin/env python3
 """
 Test script to validate webhook implementation for /llm/job endpoint.
 This tests that the /llm/job endpoint now supports webhooks
 following the same pattern as /crawl/job.
 """
 import sys
 import os
 # Add deploy/docker to path
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
 def test_llm_job_payload_model():
    """Test that LlmJobPayload includes webhook_config field"""
    print("=" * 60)
    print("TEST 1: LlmJobPayload Model")
    print("=" * 60)
    try:
        from job import LlmJobPayload
        from schemas import WebhookConfig
        from pydantic import ValidationError
        # Test with webhook_config
        payload_dict = {
            "url": "https://example.com",
            "q": "Extract main content",
            "schema": None,
            "cache": False,
            "provider": None,
            "webhook_config": {
                "webhook_url": "https://myapp.com/webhook",
                "webhook_data_in_payload": True,
                "webhook_headers": {"X-Secret": "token"}
            }
        }
        payload = LlmJobPayload(**payload_dict)
        print(f"✅ LlmJobPayload accepts webhook_config")
        print(f"   - URL: {payload.url}")
        print(f"   - Query: {payload.q}")
        print(f"   - Webhook URL: {payload.webhook_config.webhook_url}")
        print(f"   - Data in payload: {payload.webhook_config.webhook_data_in_payload}")
        # Test without webhook_config (should be optional)
        minimal_payload = {
            "url": "https://example.com",
            "q": "Extract content"
        }
        payload2 = LlmJobPayload(**minimal_payload)
        assert payload2.webhook_config is None, "webhook_config should be optional"
        print(f"✅ LlmJobPayload works without webhook_config (optional)")
        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_handle_llm_request_signature():
    """Test that handle_llm_request accepts webhook_config parameter"""
    print("\n" + "=" * 60)
    print("TEST 2: handle_llm_request Function Signature")
    print("=" * 60)
    try:
        from api import handle_llm_request
        import inspect
        sig = inspect.signature(handle_llm_request)
        params = list(sig.parameters.keys())
        print(f"Function parameters: {params}")
        if 'webhook_config' in params:
            print(f"✅ handle_llm_request has webhook_config parameter")
            # Check that it's optional with default None
            webhook_param = sig.parameters['webhook_config']
            if webhook_param.default is None or webhook_param.default == inspect.Parameter.empty:
                print(f"✅ webhook_config is optional (default: {webhook_param.default})")
            else:
                print(f"⚠️  webhook_config default is: {webhook_param.default}")
            return True
        else:
            print(f"❌ handle_llm_request missing webhook_config parameter")
            return False
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_process_llm_extraction_signature():
    """Test that process_llm_extraction accepts webhook_config parameter"""
    print("\n" + "=" * 60)
    print("TEST 3: process_llm_extraction Function Signature")
    print("=" * 60)
    try:
        from api import process_llm_extraction
        import inspect
        sig = inspect.signature(process_llm_extraction)
        params = list(sig.parameters.keys())
        print(f"Function parameters: {params}")
        if 'webhook_config' in params:
            print(f"✅ process_llm_extraction has webhook_config parameter")
            webhook_param = sig.parameters['webhook_config']
            if webhook_param.default is None or webhook_param.default == inspect.Parameter.empty:
                print(f"✅ webhook_config is optional (default: {webhook_param.default})")
            else:
                print(f"⚠️  webhook_config default is: {webhook_param.default}")
            return True
        else:
            print(f"❌ process_llm_extraction missing webhook_config parameter")
            return False
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_webhook_integration_in_api():
    """Test that api.py properly integrates webhook notifications"""
    print("\n" + "=" * 60)
    print("TEST 4: Webhook Integration in process_llm_extraction")
    print("=" * 60)
    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        with open(api_file, 'r') as f:
            api_content = f.read()
        # Check for WebhookDeliveryService initialization
        if 'webhook_service = WebhookDeliveryService(config)' in api_content:
            print("✅ process_llm_extraction initializes WebhookDeliveryService")
        else:
            print("❌ Missing WebhookDeliveryService initialization in process_llm_extraction")
            return False
        # Check for notify_job_completion calls with llm_extraction
        if 'task_type="llm_extraction"' in api_content:
            print("✅ Uses correct task_type='llm_extraction' for notifications")
        else:
            print("❌ Missing task_type='llm_extraction' in webhook notifications")
            return False
        # Count webhook notification calls (should have at least 3: success + 2 failure paths)
        notification_count = api_content.count('await webhook_service.notify_job_completion')
        # Find only in process_llm_extraction function
        llm_func_start = api_content.find('async def process_llm_extraction')
        llm_func_end = api_content.find('\nasync def ', llm_func_start + 1)
        if llm_func_end == -1:
            llm_func_end = len(api_content)
        llm_func_content = api_content[llm_func_start:llm_func_end]
        llm_notification_count = llm_func_content.count('await webhook_service.notify_job_completion')
        print(f"✅ Found {llm_notification_count} webhook notification calls in process_llm_extraction")
        if llm_notification_count >= 3:
            print(f"✅ Sufficient notification points (success + failure paths)")
        else:
            print(f"⚠️  Expected at least 3 notification calls, found {llm_notification_count}")
        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_job_endpoint_integration():
    """Test that /llm/job endpoint extracts and passes webhook_config"""
    print("\n" + "=" * 60)
    print("TEST 5: /llm/job Endpoint Integration")
    print("=" * 60)
    try:
        job_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'job.py')
        with open(job_file, 'r') as f:
            job_content = f.read()
        # Find the llm_job_enqueue function
        llm_job_start = job_content.find('async def llm_job_enqueue')
        llm_job_end = job_content.find('\n\n@router', llm_job_start + 1)
        if llm_job_end == -1:
            llm_job_end = job_content.find('\n\nasync def', llm_job_start + 1)
        llm_job_func = job_content[llm_job_start:llm_job_end]
        # Check for webhook_config extraction
        if 'webhook_config = None' in llm_job_func:
            print("✅ llm_job_enqueue initializes webhook_config variable")
        else:
            print("❌ Missing webhook_config initialization")
            return False
        if 'if payload.webhook_config:' in llm_job_func:
            print("✅ llm_job_enqueue checks for payload.webhook_config")
        else:
            print("❌ Missing webhook_config check")
            return False
        if 'webhook_config = payload.webhook_config.model_dump(mode=\'json\')' in llm_job_func:
            print("✅ llm_job_enqueue converts webhook_config to dict")
        else:
            print("❌ Missing webhook_config.model_dump conversion")
            return False
        if 'webhook_config=webhook_config' in llm_job_func:
            print("✅ llm_job_enqueue passes webhook_config to handle_llm_request")
        else:
            print("❌ Missing webhook_config parameter in handle_llm_request call")
            return False
        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_create_new_task_integration():
    """Test that create_new_task stores webhook_config in Redis"""
    print("\n" + "=" * 60)
    print("TEST 6: create_new_task Webhook Storage")
    print("=" * 60)
    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        with open(api_file, 'r') as f:
            api_content = f.read()
        # Find create_new_task function
        create_task_start = api_content.find('async def create_new_task')
        create_task_end = api_content.find('\nasync def ', create_task_start + 1)
        if create_task_end == -1:
            create_task_end = len(api_content)
        create_task_func = api_content[create_task_start:create_task_end]
        # Check for webhook_config storage
        if 'if webhook_config:' in create_task_func:
            print("✅ create_new_task checks for webhook_config")
        else:
            print("❌ Missing webhook_config check in create_new_task")
            return False
        if 'task_data["webhook_config"] = json.dumps(webhook_config)' in create_task_func:
            print("✅ create_new_task stores webhook_config in Redis task data")
        else:
            print("❌ Missing webhook_config storage in task_data")
            return False
        # Check that webhook_config is passed to process_llm_extraction
        if 'webhook_config' in create_task_func and 'background_tasks.add_task' in create_task_func:
            print("✅ create_new_task passes webhook_config to background task")
        else:
            print("⚠️  Could not verify webhook_config passed to background task")
        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_pattern_consistency():
    """Test that /llm/job follows the same pattern as /crawl/job"""
    print("\n" + "=" * 60)
    print("TEST 7: Pattern Consistency with /crawl/job")
    print("=" * 60)
    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        with open(api_file, 'r') as f:
            api_content = f.read()
        # Find handle_crawl_job to compare pattern
        crawl_job_start = api_content.find('async def handle_crawl_job')
        crawl_job_end = api_content.find('\nasync def ', crawl_job_start + 1)
        if crawl_job_end == -1:
            crawl_job_end = len(api_content)
        crawl_job_func = api_content[crawl_job_start:crawl_job_end]
        # Find process_llm_extraction
        llm_extract_start = api_content.find('async def process_llm_extraction')
        llm_extract_end = api_content.find('\nasync def ', llm_extract_start + 1)
        if llm_extract_end == -1:
            llm_extract_end = len(api_content)
        llm_extract_func = api_content[llm_extract_start:llm_extract_end]
        print("Checking pattern consistency...")
        # Both should initialize WebhookDeliveryService
        crawl_has_service = 'webhook_service = WebhookDeliveryService(config)' in crawl_job_func
        llm_has_service = 'webhook_service = WebhookDeliveryService(config)' in llm_extract_func
        if crawl_has_service and llm_has_service:
            print("✅ Both initialize WebhookDeliveryService")
        else:
            print(f"❌ Service initialization mismatch (crawl: {crawl_has_service}, llm: {llm_has_service})")
            return False
        # Both should call notify_job_completion on success
        crawl_notifies_success = 'status="completed"' in crawl_job_func and 'notify_job_completion' in crawl_job_func
        llm_notifies_success = 'status="completed"' in llm_extract_func and 'notify_job_completion' in llm_extract_func
        if crawl_notifies_success and llm_notifies_success:
            print("✅ Both notify on success")
        else:
            print(f"❌ Success notification mismatch (crawl: {crawl_notifies_success}, llm: {llm_notifies_success})")
            return False
        # Both should call notify_job_completion on failure
        crawl_notifies_failure = 'status="failed"' in crawl_job_func and 'error=' in crawl_job_func
        llm_notifies_failure = 'status="failed"' in llm_extract_func and 'error=' in llm_extract_func
        if crawl_notifies_failure and llm_notifies_failure:
            print("✅ Both notify on failure")
        else:
            print(f"❌ Failure notification mismatch (crawl: {crawl_notifies_failure}, llm: {llm_notifies_failure})")
            return False
        print("✅ /llm/job follows the same pattern as /crawl/job")
        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def main():
    """Run all tests"""
    print("\n🧪 LLM Job Webhook Feature Validation")
    print("=" * 60)
    print("Testing that /llm/job now supports webhooks like /crawl/job")
    print("=" * 60 + "\n")
    results = []
    # Run all tests
    results.append(("LlmJobPayload Model", test_llm_job_payload_model()))
    results.append(("handle_llm_request Signature", test_handle_llm_request_signature()))
    results.append(("process_llm_extraction Signature", test_process_llm_extraction_signature()))
    results.append(("Webhook Integration", test_webhook_integration_in_api()))
    results.append(("/llm/job Endpoint", test_job_endpoint_integration()))
    results.append(("create_new_task Storage", test_create_new_task_integration()))
    results.append(("Pattern Consistency", test_pattern_consistency()))
    # Print summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    passed = sum(1 for _, result in results if result)
    total = len(results)
    for test_name, result in results:
        status = "✅ PASS" if result else "❌ FAIL"
        print(f"{status} - {test_name}")
    print(f"\n{'=' * 60}")
    print(f"Results: {passed}/{total} tests passed")
    print(f"{'=' * 60}")
    if passed == total:
        print("\n🎉 All tests passed! /llm/job webhook feature is correctly implemented.")
        print("\n📝 Summary of changes:")
        print("  1. LlmJobPayload model includes webhook_config field")
        print("  2. /llm/job endpoint extracts and passes webhook_config")
        print("  3. handle_llm_request accepts webhook_config parameter")
        print("  4. create_new_task stores webhook_config in Redis")
        print("  5. process_llm_extraction sends webhook notifications")
        print("  6. Follows the same pattern as /crawl/job")
        return 0
    else:
        print(f"\n⚠️  {total - passed} test(s) failed. Please review the output above.")
        return 1
 if __name__ == "__main__":
    exit(main())
--- a/test_webhook_implementation.py
+++ b/test_webhook_implementation.py
@@ -14,7 +14,8 @@ import json
 from datetime import datetime, timezone
 # Add deploy/docker to path to import modules
-sys.path.insert(0, '/home/user/crawl4ai/deploy/docker')
+# sys.path.insert(0, '/home/user/crawl4ai/deploy/docker')
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
 def test_imports():
    """Test that all webhook-related modules can be imported"""
@@ -237,7 +238,8 @@ def test_api_integration():
    try:
        # Check if api.py can import webhook module
-        with open('/home/user/crawl4ai/deploy/docker/api.py', 'r') as f:
+        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        with open(api_path, 'r') as f:
            api_content = f.read()
        if 'from webhook import WebhookDeliveryService' in api_content: