From bcac486921bae0285e4b7a5592e424259adfccbd Mon Sep 17 00:00:00 2001
From: ntohidi
Date: Wed, 22 Oct 2025 16:19:30 +0200
Subject: [PATCH] docs: enhance README and docker-deployment documentation
 with Job Queue and Webhook API details

---
 deploy/docker/README.md              |  48 +++
 docs/md_v2/core/docker-deployment.md | 466 +++++++++++++++++++++++++++
 2 files changed, 514 insertions(+)

diff --git a/deploy/docker/README.md b/deploy/docker/README.md
index 2c920ef1..cee8af7f 100644
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -785,6 +785,54 @@ curl http://localhost:11235/crawl/job/crawl_xyz
 
 The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`.
 
+#### LLM Extraction Jobs with Webhooks
+
+The same webhook system works for LLM extraction jobs via `/llm/job`:
+
+```bash
+# Submit LLM extraction job with webhook
+curl -X POST http://localhost:11235/llm/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "url": "https://example.com/article",
+    "q": "Extract the article title, author, and main points",
+    "provider": "openai/gpt-4o-mini",
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/llm-complete",
+      "webhook_data_in_payload": true,
+      "webhook_headers": {
+        "X-Webhook-Secret": "your-secret-token"
+      }
+    }
+  }'
+
+# Response: {"task_id": "llm_1234567890"}
+```
+
+**Your webhook receives:**
+```json
+{
+  "task_id": "llm_1234567890",
+  "task_type": "llm_extraction",
+  "status": "completed",
+  "timestamp": "2025-10-22T12:30:00.000000+00:00",
+  "urls": ["https://example.com/article"],
+  "data": {
+    "extracted_content": {
+      "title": "Understanding Web Scraping",
+      "author": "John Doe",
+      "main_points": ["Point 1", "Point 2", "Point 3"]
+    }
+  }
+}
+```
+
+**Key Differences for LLM Jobs:**
+- Task type is `"llm_extraction"` instead of `"crawl"`
+- Extracted data is in `data.extracted_content`
+- Single URL only (not an array)
+- Supports schema-based extraction with the `schema` parameter (see the sketch below)
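+
+The `schema` parameter is passed as a JSON **string**, as the full request examples in the deployment docs show. A minimal sketch of a schema-constrained submission against the endpoint above; the field names in `schema` are illustrative, not a server-defined contract:
+
+```python
+import json
+
+import requests
+
+# Illustrative schema: describes the shape you want back
+schema = {"title": "string", "author": "string", "main_points": ["string"]}
+
+resp = requests.post(
+    "http://localhost:11235/llm/job",
+    json={
+        "url": "https://example.com/article",
+        "q": "Extract the article title, author, and main points",
+        "schema": json.dumps(schema),  # serialized to a JSON string
+        "webhook_config": {"webhook_url": "https://myapp.com/webhooks/llm-complete"},
+    },
+)
+print(resp.json()["task_id"])  # e.g. "llm_1234567890"
+```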
 
 > 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
 
 ---
 
diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md
index ec2c69a5..6d5a9eae 100644
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -27,6 +27,14 @@
   - [Hook Response Information](#hook-response-information)
   - [Error Handling](#error-handling)
   - [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
+- [Job Queue & Webhook API](#job-queue-webhook-api)
+  - [Why Use the Job Queue API?](#why-use-the-job-queue-api)
+  - [Available Endpoints](#available-endpoints)
+  - [Webhook Configuration](#webhook-configuration)
+  - [Usage Examples](#usage-examples)
+  - [Webhook Best Practices](#webhook-best-practices)
+  - [Use Cases](#use-cases)
+  - [Troubleshooting](#troubleshooting)
 - [Dockerfile Parameters](#dockerfile-parameters)
 - [Using the API](#using-the-api)
   - [Playground Interface](#playground-interface)
@@ -1110,6 +1118,464 @@ if __name__ == "__main__":
 
 ---
 
+## Job Queue & Webhook API
+
+The Docker deployment includes an asynchronous job queue with webhook support for both crawling and LLM extraction tasks. Instead of waiting for a long-running operation to complete, you submit a job and receive a real-time webhook notification when it finishes.
+
+### Why Use the Job Queue API?
+
+**Traditional Synchronous API (`/crawl`):**
+- Client waits for the entire crawl to complete
+- Timeout issues with long-running crawls
+- Resource blocking during execution
+- Constant polling required for status updates
+
+**Asynchronous Job Queue API (`/crawl/job`, `/llm/job`):**
+- ✅ Submit a job and continue immediately
+- ✅ No timeout concerns for long operations
+- ✅ Real-time webhook notifications on completion
+- ✅ Better resource utilization
+- ✅ Perfect for batch processing
+- ✅ Ideal for microservice architectures
+
+The client-side difference is sketched below.
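+
+Assuming the server from this guide on `localhost:11235`: the synchronous call blocks until the crawl finishes, while the job endpoint returns a task ID at once.
+
+```python
+import requests
+
+BASE = "http://localhost:11235"
+payload = {"urls": ["https://example.com"]}
+
+# Synchronous: the request blocks until the whole crawl completes (or times out)
+sync_result = requests.post(f"{BASE}/crawl", json=payload, timeout=300)
+
+# Asynchronous: returns immediately with a task ID; results arrive later
+# via a webhook or GET /job/{task_id}
+task_id = requests.post(f"{BASE}/crawl/job", json=payload).json()["task_id"]
+print(task_id)  # e.g. "crawl_1698765432"
+```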
+
+### Available Endpoints
+
+#### 1. Crawl Job Endpoint
+
+```
+POST /crawl/job
+```
+
+Submit an asynchronous crawl job with optional webhook notification.
+
+**Request Body:**
+```json
+{
+  "urls": ["https://example.com"],
+  "cache_mode": "bypass",
+  "extraction_strategy": {
+    "type": "JsonCssExtractionStrategy",
+    "schema": {
+      "title": "h1",
+      "content": ".article-body"
+    }
+  },
+  "webhook_config": {
+    "webhook_url": "https://your-app.com/webhook/crawl-complete",
+    "webhook_data_in_payload": true,
+    "webhook_headers": {
+      "X-Webhook-Secret": "your-secret-token",
+      "X-Custom-Header": "value"
+    }
+  }
+}
+```
+
+**Response:**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "message": "Crawl job submitted"
+}
+```
+
+#### 2. LLM Extraction Job Endpoint
+
+```
+POST /llm/job
+```
+
+Submit an asynchronous LLM extraction job with optional webhook notification.
+
+**Request Body:**
+```json
+{
+  "url": "https://example.com/article",
+  "q": "Extract the article title, author, publication date, and main points",
+  "provider": "openai/gpt-4o-mini",
+  "schema": "{\"title\": \"string\", \"author\": \"string\", \"date\": \"string\", \"points\": [\"string\"]}",
+  "cache": false,
+  "webhook_config": {
+    "webhook_url": "https://your-app.com/webhook/llm-complete",
+    "webhook_data_in_payload": true,
+    "webhook_headers": {
+      "X-Webhook-Secret": "your-secret-token"
+    }
+  }
+}
+```
+
+**Response:**
+```json
+{
+  "task_id": "llm_1698765432",
+  "message": "LLM job submitted"
+}
+```
+
+#### 3. Job Status Endpoint
+
+```
+GET /job/{task_id}
+```
+
+Check the status and retrieve the results of a submitted job.
+
+**Response (In Progress):**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "status": "processing",
+  "message": "Job is being processed"
+}
+```
+
+**Response (Completed):**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "status": "completed",
+  "result": {
+    "markdown": "# Page Title\n\nContent...",
+    "extracted_content": {...},
+    "links": {...}
+  }
+}
+```
+
+### Webhook Configuration
+
+Webhooks provide real-time notifications when your jobs complete, eliminating the need for constant polling.
+
+#### Webhook Config Parameters
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `webhook_url` | string | Yes | Your HTTP(S) endpoint to receive notifications |
+| `webhook_data_in_payload` | boolean | No | Include full result data in the webhook payload (default: false) |
+| `webhook_headers` | object | No | Custom headers for authentication/identification |
+
+#### Webhook Payload Format
+
+**Success Notification (Crawl Job):**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "task_type": "crawl",
+  "status": "completed",
+  "timestamp": "2025-10-22T12:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "data": {
+    "markdown": "# Page content...",
+    "extracted_content": {...},
+    "links": {...}
+  }
+}
+```
+
+**Success Notification (LLM Job):**
+```json
+{
+  "task_id": "llm_1698765432",
+  "task_type": "llm_extraction",
+  "status": "completed",
+  "timestamp": "2025-10-22T12:30:00.000000+00:00",
+  "urls": ["https://example.com/article"],
+  "data": {
+    "extracted_content": {
+      "title": "Understanding Web Scraping",
+      "author": "John Doe",
+      "date": "2025-10-22",
+      "points": ["Point 1", "Point 2"]
+    }
+  }
+}
+```
+
+**Failure Notification:**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "task_type": "crawl",
+  "status": "failed",
+  "timestamp": "2025-10-22T12:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "error": "Connection timeout after 30 seconds"
+}
+```
+
+#### Webhook Delivery & Retry
+
+- **Delivery Method:** HTTP POST to your `webhook_url`
+- **Content-Type:** `application/json`
+- **Retry Policy:** Exponential backoff with 5 attempts
+  - Attempt 1: Immediate
+  - Attempt 2: 1 second delay
+  - Attempt 3: 2 seconds delay
+  - Attempt 4: 4 seconds delay
+  - Attempt 5: 8 seconds delay
+- **Success Status Codes:** 200-299
+- **Custom Headers:** Your `webhook_headers` are included in every request
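+
+To see the retry policy in action, a throwaway receiver can deliberately fail the first delivery; with the schedule above, the second attempt should arrive about one second later. A minimal sketch using Flask (the `/webhook/test` route and port are arbitrary):
+
+```python
+from flask import Flask, jsonify, request
+
+app = Flask(__name__)
+seen = set()
+
+@app.route('/webhook/test', methods=['POST'])
+def test_webhook():
+    task_id = request.json.get('task_id')
+    if task_id not in seen:
+        seen.add(task_id)
+        # A non-2xx response makes the server retry with exponential backoff
+        return jsonify({"error": "try again"}), 500
+    # A repeat delivery of the same task succeeds
+    return jsonify({"status": "received"}), 200
+
+if __name__ == '__main__':
+    app.run(port=5001)
+```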
+
+### Usage Examples
+
+#### Example 1: Python with Webhook Handler (Flask)
+
+```python
+from flask import Flask, request, jsonify
+import requests
+
+app = Flask(__name__)
+
+# Webhook handler
+@app.route('/webhook/crawl-complete', methods=['POST'])
+def handle_crawl_webhook():
+    payload = request.json
+
+    if payload['status'] == 'completed':
+        print(f"✅ Job {payload['task_id']} completed!")
+        print(f"Task type: {payload['task_type']}")
+
+        # Access the crawl results
+        if 'data' in payload:
+            markdown = payload['data'].get('markdown', '')
+            extracted = payload['data'].get('extracted_content', {})
+            print(f"Extracted {len(markdown)} characters")
+            print(f"Structured data: {extracted}")
+    else:
+        print(f"❌ Job {payload['task_id']} failed: {payload.get('error')}")
+
+    return jsonify({"status": "received"}), 200
+
+# Submit a crawl job with webhook
+def submit_crawl_job():
+    response = requests.post(
+        "http://localhost:11235/crawl/job",
+        json={
+            "urls": ["https://example.com"],
+            "extraction_strategy": {
+                "type": "JsonCssExtractionStrategy",
+                "schema": {
+                    "name": "Example Schema",
+                    "baseSelector": "body",
+                    "fields": [
+                        {"name": "title", "selector": "h1", "type": "text"},
+                        {"name": "description", "selector": "meta[name='description']", "type": "attribute", "attribute": "content"}
+                    ]
+                }
+            },
+            "webhook_config": {
+                "webhook_url": "https://your-app.com/webhook/crawl-complete",
+                "webhook_data_in_payload": True,
+                "webhook_headers": {
+                    "X-Webhook-Secret": "your-secret-token"
+                }
+            }
+        }
+    )
+
+    task_id = response.json()['task_id']
+    print(f"Job submitted: {task_id}")
+    return task_id
+
+if __name__ == '__main__':
+    app.run(port=5000)
+```
+
+#### Example 2: LLM Extraction with Webhooks
+
+```python
+import requests
+from flask import Flask, jsonify, request
+
+app = Flask(__name__)
+
+def submit_llm_job_with_webhook():
+    response = requests.post(
+        "http://localhost:11235/llm/job",
+        json={
+            "url": "https://example.com/article",
+            "q": "Extract the article title, author, and main points",
+            "provider": "openai/gpt-4o-mini",
+            "webhook_config": {
+                "webhook_url": "https://your-app.com/webhook/llm-complete",
+                "webhook_data_in_payload": True,
+                "webhook_headers": {
+                    "X-Webhook-Secret": "your-secret-token"
+                }
+            }
+        }
+    )
+
+    task_id = response.json()['task_id']
+    print(f"LLM job submitted: {task_id}")
+    return task_id
+
+# Webhook handler for LLM jobs
+@app.route('/webhook/llm-complete', methods=['POST'])
+def handle_llm_webhook():
+    payload = request.json
+
+    if payload['status'] == 'completed':
+        extracted = payload['data']['extracted_content']
+        print("✅ LLM extraction completed!")
+        print(f"Results: {extracted}")
+    else:
+        print(f"❌ LLM extraction failed: {payload.get('error')}")
+
+    return jsonify({"status": "received"}), 200
+```
+
+#### Example 3: Without Webhooks (Polling)
+
+If you don't use webhooks, you can poll for results:
+
+```python
+import requests
+import time
+
+# Submit job
+response = requests.post(
+    "http://localhost:11235/crawl/job",
+    json={"urls": ["https://example.com"]}
+)
+task_id = response.json()['task_id']
+
+# Poll for results
+while True:
+    result = requests.get(f"http://localhost:11235/job/{task_id}")
+    data = result.json()
+
+    if data['status'] == 'completed':
+        print("Job completed!")
+        print(data['result'])
+        break
+    elif data['status'] == 'failed':
+        print(f"Job failed: {data.get('error')}")
+        break
+
+    print("Still processing...")
+    time.sleep(2)
+```
+
+#### Example 4: Global Webhook Configuration
+
+Set a default webhook URL in your `config.yml` to avoid repeating it in every request:
+
+```yaml
+# config.yml
+api:
+  crawler:
+    # ... other settings ...
+    webhook:
+      default_url: "https://your-app.com/webhook/default"
+      default_headers:
+        X-Webhook-Secret: "your-secret-token"
+```
+
+Then submit jobs without a webhook config:
+
+```python
+# Uses the global webhook configuration
+response = requests.post(
+    "http://localhost:11235/crawl/job",
+    json={"urls": ["https://example.com"]}
+)
+```
+
+### Webhook Best Practices
+
+1. **Authentication:** Always use custom headers for webhook authentication
+   ```json
+   "webhook_headers": {
+     "X-Webhook-Secret": "your-secret-token"
+   }
+   ```
+
+2. **Idempotency:** Design your webhook handler to be idempotent (safe to receive duplicate notifications)
+
+3. **Fast Response:** Return HTTP 200 quickly; process data asynchronously if needed
+   ```python
+   @app.route('/webhook', methods=['POST'])
+   def webhook():
+       payload = request.json
+       # Hand off to a background queue (e.g. an RQ queue created elsewhere)
+       queue.enqueue(process_webhook, payload)
+       return jsonify({"status": "received"}), 200
+   ```
+
+4. **Error Handling:** Handle both success and failure notifications
+   ```python
+   if payload['status'] == 'completed':
+       ...  # process the result
+   elif payload['status'] == 'failed':
+       ...  # log the error, retry, or alert
+   ```
+
+5. **Validation:** Verify webhook authenticity using custom headers
+   ```python
+   secret = request.headers.get('X-Webhook-Secret')
+   if secret != os.environ['EXPECTED_SECRET']:
+       return jsonify({"error": "Unauthorized"}), 401
+   ```
+
+6. **Logging:** Log webhook deliveries for debugging
+   ```python
+   logger.info(f"Webhook received: {payload['task_id']} - {payload['status']}")
+   ```
+
+### Use Cases
+
+**1. Batch Processing**
+Submit hundreds of URLs and get notified as each completes:
+```python
+urls = ["https://site1.com", "https://site2.com", ...]
+for url in urls:
+    submit_crawl_job(url, webhook_url="https://app.com/webhook")
+```
+
+**2. Microservice Integration**
+Integrate with event-driven architectures:
+```python
+# Service A submits the job
+task_id = submit_crawl_job(url)
+
+# Service B receives the webhook and triggers the next step
+@app.route('/webhook')
+def webhook():
+    process_result(request.json)
+    trigger_next_service()
+    return "OK", 200
+```
+
+**3. Long-Running Extractions**
+Handle complex LLM extractions without timeouts:
+```python
+submit_llm_job(
+    url="https://long-article.com",
+    q="Comprehensive summary with key points and analysis",
+    webhook_url="https://app.com/webhook/llm"
+)
+```
+
+### Troubleshooting
+
+**Webhook not receiving notifications?**
+- Check that your webhook URL is publicly accessible
+- Verify firewall/security group settings
+- Use a webhook testing tool such as webhook.site for debugging
+- Check the server logs for delivery attempts
+- Ensure your handler returns a 200-299 status code
+
+**Job stuck in processing?**
+- Check the Redis connection: `docker logs <container-name> | grep redis`
+- Verify worker processes: `docker exec <container-name> ps aux | grep worker`
+- Check the server logs: `docker logs <container-name>`
+
+**Need to cancel a job?**
+Jobs are processed asynchronously. If you need to cancel:
+- Delete the task from Redis (requires Redis CLI access); see the sketch below
+- Or implement a cancellation endpoint in your webhook handler
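+
+A minimal sketch of the Redis route using `redis-py`. The key pattern is an assumption, not a documented layout; inspect your instance first and adapt it to the naming your deployment actually uses:
+
+```python
+import redis
+
+r = redis.Redis(host="localhost", port=6379)
+
+# Find keys that mention the task ID (key naming is deployment-specific)
+for key in r.scan_iter("*crawl_1698765432*"):
+    print(key)
+    # r.delete(key)  # uncomment to actually remove the task
+```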
+
+---
+
 ## Dockerfile Parameters
 
 You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.