diff --git a/Dockerfile b/Dockerfile
index 1267578c..33c33d55 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 FROM python:3.12-slim-bookworm AS build
 
 # C4ai version
-ARG C4AI_VER=0.7.0-r1
+ARG C4AI_VER=0.7.6
 ENV C4AI_VERSION=$C4AI_VER
 LABEL c4ai.version=$C4AI_VER
diff --git a/README.md b/README.md
index 431c6ba0..d9a68482 100644
--- a/README.md
+++ b/README.md
@@ -27,13 +27,13 @@
 Crawl4AI turns the web into clean, LLM-ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle-tested by a 50k+ star community.
 
-[✨ Check out latest update v0.7.5](#-recent-updates)
+[✨ Check out latest update v0.7.6](#-recent-updates)
 
-✨ New in v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
+✨ **New in v0.7.6**: Complete Webhook Infrastructure for the Docker Job Queue API! Real-time notifications for both `/crawl/job` and `/llm/job` endpoints with exponential backoff retry, custom headers, and flexible delivery modes. No more polling! [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.6.md)
 
-✨ Recent v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
+✨ Recent v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
 
-✨ Previous v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
+✨ Previous v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
 🤓 My Personal Story
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 550c1e08..36be3827 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,7 +1,7 @@
 # crawl4ai/__version__.py
 
 # This is the version that will be used for stable releases
-__version__ = "0.7.5"
+__version__ = "0.7.6"
 
 # For nightly builds, this gets set during build process
 __nightly_version__ = None
diff --git a/deploy/docker/README.md b/deploy/docker/README.md
index 9206250e..2c920ef1 100644
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -59,15 +59,13 @@ Pull and run images directly from Docker Hub without building locally.
 
 #### 1. Pull the Image
 
-Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
-
-> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
+Our latest stable release is `0.7.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
 
 ```bash
-# Pull the release candidate (for testing new features)
-docker pull unclecode/crawl4ai:0.7.0-r1
+# Pull the latest stable version (0.7.6)
+docker pull unclecode/crawl4ai:0.7.6
 
-# Or pull the current stable version (0.6.0)
+# Or use the latest tag (points to 0.7.6)
 docker pull unclecode/crawl4ai:latest
 ```
@@ -102,7 +100,7 @@ EOL
     -p 11235:11235 \
     --name crawl4ai \
     --shm-size=1g \
-    unclecode/crawl4ai:0.7.0-r1
+    unclecode/crawl4ai:0.7.6
   ```
 
 * **With LLM support:**
@@ -113,7 +111,7 @@ EOL
     --name crawl4ai \
     --env-file .llm.env \
     --shm-size=1g \
-    unclecode/crawl4ai:0.7.0-r1
+    unclecode/crawl4ai:0.7.6
   ```
 
 > The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
@@ -186,7 +184,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
 
   ```bash
-  # Pulls and runs the release candidate from Docker Hub
+  # Pulls and runs the latest stable release from Docker Hub
   # Automatically selects the correct architecture
-  IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
+  IMAGE=unclecode/crawl4ai:0.7.6 docker compose up -d
   ```
 
 * **Build and Run Locally:**
diff --git a/docs/blog/release-v0.7.6.md b/docs/blog/release-v0.7.6.md
new file mode 100644
index 00000000..e27d19cc
--- /dev/null
+++ b/docs/blog/release-v0.7.6.md
@@ -0,0 +1,314 @@
+# Crawl4AI v0.7.6 Release Notes
+
+*Release Date: October 22, 2025*
+
+I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
+
+## 🎯 What's New
+
+### Webhook Support for Docker Job Queue API
+
+The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done: get instant notifications when they complete!
+
+**Key Capabilities:**
+
+- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
+- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
+- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
+- ✅ **Custom Authentication**: Add custom headers for webhook authentication
+- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
+- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
+
+### How It Works
+
+Instead of constantly checking job status:
+
+**OLD WAY (Polling):**
+```python
+import time
+import requests
+
+# Submit job
+payload = {"urls": ["https://example.com"]}
+response = requests.post("http://localhost:11235/crawl/job", json=payload)
+task_id = response.json()['task_id']
+
+# Poll until complete
+while True:
+    status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
+    if status.json()['status'] == 'completed':
+        break
+    time.sleep(5)  # Wait and try again
+```
+
+**NEW WAY (Webhooks):**
+```python
+import requests
+
+# Submit job with webhook
+payload = {
+    "urls": ["https://example.com"],
+    "webhook_config": {
+        "webhook_url": "https://myapp.com/webhook",
+        "webhook_data_in_payload": True
+    }
+}
+response = requests.post("http://localhost:11235/crawl/job", json=payload)
+
+# Done! The webhook will notify you when the job completes
+# Your webhook handler receives the results automatically
+```
+
+### Crawl Job Webhooks
+
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "urls": ["https://example.com"],
+    "browser_config": {"headless": true},
+    "crawler_config": {"cache_mode": "bypass"},
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+      "webhook_data_in_payload": false,
+      "webhook_headers": {
+        "X-Webhook-Secret": "your-secret-token"
+      }
+    }
+  }'
+```
+
+### LLM Extraction Job Webhooks (NEW!)
+
+```bash
+curl -X POST http://localhost:11235/llm/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "url": "https://example.com/article",
+    "q": "Extract the article title, author, and publication date",
+    "schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
+    "provider": "openai/gpt-4o-mini",
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/llm-complete",
+      "webhook_data_in_payload": true
+    }
+  }'
+```
+
+### Webhook Payload Structure
+
+**Success (with data):**
+```json
+{
+  "task_id": "llm_1698765432",
+  "task_type": "llm_extraction",
+  "status": "completed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com/article"],
+  "data": {
+    "extracted_content": {
+      "title": "Understanding Web Scraping",
+      "author": "John Doe",
+      "date": "2025-10-22"
+    }
+  }
+}
+```
+
+**Failure:**
+```json
+{
+  "task_id": "crawl_abc123",
+  "task_type": "crawl",
+  "status": "failed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "error": "Connection timeout after 30s"
+}
+```
+
+### Simple Webhook Handler Example
+
+```python
+import requests
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+
+@app.route('/webhook', methods=['POST'])
+def handle_webhook():
+    payload = request.json
+    # If you send webhook_headers (e.g., X-Webhook-Secret), verify them here
+    # before trusting the payload
+
+    task_id = payload['task_id']
+    task_type = payload['task_type']
+    status = payload['status']
+
+    if status == 'completed':
+        if 'data' in payload:
+            # Process data directly
+            data = payload['data']
+        else:
+            # Notification-only mode: fetch the result from the API
+            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
+            response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
+            data = response.json()
+
+        # Your business logic here
+        print(f"Job {task_id} completed!")
+
+    elif status == 'failed':
+        error = payload.get('error', 'Unknown error')
+        print(f"Job {task_id} failed: {error}")
+
+    return jsonify({"status": "received"}), 200
+
+if __name__ == '__main__':
+    app.run(port=8080)
+```
+
+## 📊 Performance Improvements
+
+- **Reduced Server Load**: Eliminates constant polling requests
+- **Lower Latency**: Instant notification vs. polling interval delay
+- **Better Resource Usage**: Frees up client connections while jobs run in the background
+- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
+
+## 🐛 Bug Fixes
+
+- Fixed webhook configuration serialization for Pydantic HttpUrl fields
+- Improved error handling in the webhook delivery service
+- Enhanced Redis task storage for webhook config persistence
+
+## 🌍 Expected Real-World Impact
+
+### For Web Scraping Workflows
+- **Reduced Costs**: Fewer API calls = lower bandwidth and server costs
+- **Better UX**: Instant notifications improve user experience
+- **Scalability**: Handle 100s of concurrent jobs without polling overhead
+
+### For LLM Extraction Pipelines
+- **Async Processing**: Submit LLM extraction jobs and move on
+- **Batch Processing**: Queue multiple extractions, get notified as they complete
+- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
+
+### For Microservices
+- **Event-Driven**: Perfect for event-driven microservice architectures
+- **Decoupling**: Decouple job submission from result processing
+- **Reliability**: Automatic retries ensure webhooks are delivered
+
+## 🔄 Breaking Changes
+
+**None!** This release is fully backward compatible.
+
+- Webhook configuration is optional
+- Existing code continues to work without modification
+- Polling is still supported for jobs without a webhook config
+
+## 📚 Documentation
+
+### New Documentation
+- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
+- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
+
+### Updated Documentation
+- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
+- API documentation with webhook examples
+
+## 🛠️ Migration Guide
+
+No migration needed! Webhooks are opt-in:
+
+1. **To use webhooks**: Add `webhook_config` to your job payload
+2. **To keep polling**: Continue using your existing code
+
+### Quick Start
+
+```python
+# Just add webhook_config to your existing payload
+payload = {
+    # Your existing configuration
+    "urls": ["https://example.com"],
+    "browser_config": {...},
+    "crawler_config": {...},
+
+    # NEW: Add webhook configuration
+    "webhook_config": {
+        "webhook_url": "https://myapp.com/webhook",
+        "webhook_data_in_payload": True
+    }
+}
+```
+
+## 🔧 Configuration
+
+### Global Webhook Configuration (config.yml)
+
+```yaml
+webhooks:
+  enabled: true
+  default_url: "https://myapp.com/webhooks/default"  # Optional
+  data_in_payload: false
+  retry:
+    max_attempts: 5
+    initial_delay_ms: 1000
+    max_delay_ms: 32000
+    timeout_ms: 30000
+  headers:
+    User-Agent: "Crawl4AI-Webhook/1.0"
+```
+
+## 🚀 Upgrade Instructions
+
+### Docker
+
+```bash
+# Pull the latest image
+docker pull unclecode/crawl4ai:0.7.6
+
+# Or use the latest tag
+docker pull unclecode/crawl4ai:latest
+
+# Run with webhook support
+docker run -d \
+  -p 11235:11235 \
+  --env-file .llm.env \
+  --name crawl4ai \
+  unclecode/crawl4ai:0.7.6
+```
+
+### Python Package
+
+```bash
+pip install --upgrade crawl4ai
+```
+
+## 💡 Pro Tips
+
+1. **Use notification-only mode** for large results: fetch the data separately to avoid oversized webhook payloads
+2. **Set custom headers** for webhook authentication and request tracking
+3. **Configure a global default webhook** for consistent handling across all jobs
+4. **Implement idempotent webhook handlers**: the same webhook may be delivered multiple times on retry (see the sketch below)
+5. **Use structured schemas** with LLM extraction for predictable webhook data
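+
+Tip 4 deserves a sketch. Retries mean at-least-once delivery, so deduplicate on `task_id`. The snippet below is an illustrative pattern, not part of the Crawl4AI API; the in-memory set stands in for a durable store such as Redis:
+
+```python
+processed_task_ids = set()
+
+def handle_once(payload: dict) -> bool:
+    """Process a webhook payload at most once per task_id."""
+    task_id = payload['task_id']
+    if task_id in processed_task_ids:
+        return False  # duplicate delivery from a retry; ignore safely
+    processed_task_ids.add(task_id)
+    # ... your business logic here ...
+    return True
+```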
+
+## 🎬 Demo
+
+Try the release demo:
+
+```bash
+python docs/releases_review/demo_v0.7.6.py
+```
+
+This comprehensive demo showcases:
+- Crawl job webhooks (notification-only and with data)
+- LLM extraction webhooks (with JSON schema support)
+- Custom headers for authentication
+- Webhook retry mechanism
+- Real-time webhook receiver
+
+## 🙏 Acknowledgments
+
+Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
+
+## 📞 Support
+
+- **Documentation**: https://docs.crawl4ai.com
+- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
+- **Discord**: https://discord.gg/crawl4ai
+
+---
+
+**Happy crawling with webhooks!** 🕷️🪝
+
+*- unclecode*
diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md
index cedd8e86..c955572e 100644
--- a/docs/md_v2/blog/index.md
+++ b/docs/md_v2/blog/index.md
@@ -20,6 +20,23 @@ Ever wondered why your AI coding assistant struggles with your library despite c
 
 ## Latest Release
 
+### [Crawl4AI v0.7.6 – The Webhook Infrastructure Update](../blog/release-v0.7.6.md)
+*October 22, 2025*
+
+Crawl4AI v0.7.6 introduces comprehensive webhook support for the Docker job queue API, bringing real-time notifications to both crawling and LLM extraction workflows. No more polling!
+
+Key highlights:
+- **🪝 Complete Webhook Support**: Real-time notifications for both `/crawl/job` and `/llm/job` endpoints
+- **🔄 Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
+- **🔐 Custom Authentication**: Add custom headers for webhook authentication
+- **📊 Flexible Delivery**: Choose notification-only or include full data in the payload
+- **⚙️ Global Configuration**: Set default webhook URL in config.yml for all jobs
+- **🎯 Zero Breaking Changes**: Fully backward compatible; webhooks are opt-in
+
+[Read full release notes →](../blog/release-v0.7.6.md)
+
+## Recent Releases
+
 ### [Crawl4AI v0.7.5 – The Docker Hooks & Security Update](../blog/release-v0.7.5.md)
 *September 29, 2025*
diff --git a/docs/md_v2/blog/releases/0.7.6.md b/docs/md_v2/blog/releases/0.7.6.md
new file mode 100644
index 00000000..e27d19cc
--- /dev/null
+++ b/docs/md_v2/blog/releases/0.7.6.md
@@ -0,0 +1,314 @@
+# Crawl4AI v0.7.6 Release Notes
+
+*Release Date: October 22, 2025*
+
+I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
+
+## 🎯 What's New
+
+### Webhook Support for Docker Job Queue API
+
+The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done: get instant notifications when they complete!
+
+**Key Capabilities:**
+
+- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
+- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
+- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
+- ✅ **Custom Authentication**: Add custom headers for webhook authentication
+- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
+- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
+
+### How It Works
+
+Instead of constantly checking job status:
+
+**OLD WAY (Polling):**
+```python
+import time
+import requests
+
+# Submit job
+payload = {"urls": ["https://example.com"]}
+response = requests.post("http://localhost:11235/crawl/job", json=payload)
+task_id = response.json()['task_id']
+
+# Poll until complete
+while True:
+    status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
+    if status.json()['status'] == 'completed':
+        break
+    time.sleep(5)  # Wait and try again
+```
+
+**NEW WAY (Webhooks):**
+```python
+import requests
+
+# Submit job with webhook
+payload = {
+    "urls": ["https://example.com"],
+    "webhook_config": {
+        "webhook_url": "https://myapp.com/webhook",
+        "webhook_data_in_payload": True
+    }
+}
+response = requests.post("http://localhost:11235/crawl/job", json=payload)
+
+# Done! The webhook will notify you when the job completes
+# Your webhook handler receives the results automatically
+```
+
+### Crawl Job Webhooks
+
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "urls": ["https://example.com"],
+    "browser_config": {"headless": true},
+    "crawler_config": {"cache_mode": "bypass"},
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+      "webhook_data_in_payload": false,
+      "webhook_headers": {
+        "X-Webhook-Secret": "your-secret-token"
+      }
+    }
+  }'
+```
+
+### LLM Extraction Job Webhooks (NEW!)
+
+```bash
+curl -X POST http://localhost:11235/llm/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "url": "https://example.com/article",
+    "q": "Extract the article title, author, and publication date",
+    "schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
+    "provider": "openai/gpt-4o-mini",
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/llm-complete",
+      "webhook_data_in_payload": true
+    }
+  }'
+```
+
+### Webhook Payload Structure
+
+**Success (with data):**
+```json
+{
+  "task_id": "llm_1698765432",
+  "task_type": "llm_extraction",
+  "status": "completed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com/article"],
+  "data": {
+    "extracted_content": {
+      "title": "Understanding Web Scraping",
+      "author": "John Doe",
+      "date": "2025-10-22"
+    }
+  }
+}
+```
+
+**Failure:**
+```json
+{
+  "task_id": "crawl_abc123",
+  "task_type": "crawl",
+  "status": "failed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "error": "Connection timeout after 30s"
+}
+```
+
+### Simple Webhook Handler Example
+
+```python
+import requests
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+
+@app.route('/webhook', methods=['POST'])
+def handle_webhook():
+    payload = request.json
+    # If you send webhook_headers (e.g., X-Webhook-Secret), verify them here
+    # before trusting the payload
+
+    task_id = payload['task_id']
+    task_type = payload['task_type']
+    status = payload['status']
+
+    if status == 'completed':
+        if 'data' in payload:
+            # Process data directly
+            data = payload['data']
+        else:
+            # Notification-only mode: fetch the result from the API
+            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
+            response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
+            data = response.json()
+
+        # Your business logic here
+        print(f"Job {task_id} completed!")
+
+    elif status == 'failed':
+        error = payload.get('error', 'Unknown error')
+        print(f"Job {task_id} failed: {error}")
+
+    return jsonify({"status": "received"}), 200
+
+if __name__ == '__main__':
+    app.run(port=8080)
+```
+
+## 📊 Performance Improvements
+
+- **Reduced Server Load**: Eliminates constant polling requests
+- **Lower Latency**: Instant notification vs. polling interval delay
+- **Better Resource Usage**: Frees up client connections while jobs run in the background
+- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
+
+## 🐛 Bug Fixes
+
+- Fixed webhook configuration serialization for Pydantic HttpUrl fields
+- Improved error handling in the webhook delivery service
+- Enhanced Redis task storage for webhook config persistence
+
+## 🌍 Expected Real-World Impact
+
+### For Web Scraping Workflows
+- **Reduced Costs**: Fewer API calls = lower bandwidth and server costs
+- **Better UX**: Instant notifications improve user experience
+- **Scalability**: Handle 100s of concurrent jobs without polling overhead
+
+### For LLM Extraction Pipelines
+- **Async Processing**: Submit LLM extraction jobs and move on
+- **Batch Processing**: Queue multiple extractions, get notified as they complete
+- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
+
+### For Microservices
+- **Event-Driven**: Perfect for event-driven microservice architectures
+- **Decoupling**: Decouple job submission from result processing
+- **Reliability**: Automatic retries ensure webhooks are delivered
+
+## 🔄 Breaking Changes
+
+**None!** This release is fully backward compatible.
+
+- Webhook configuration is optional
+- Existing code continues to work without modification
+- Polling is still supported for jobs without a webhook config
+
+## 📚 Documentation
+
+### New Documentation
+- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
+- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
+
+### Updated Documentation
+- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
+- API documentation with webhook examples
+
+## 🛠️ Migration Guide
+
+No migration needed! Webhooks are opt-in:
+
+1. **To use webhooks**: Add `webhook_config` to your job payload
+2. **To keep polling**: Continue using your existing code
+
+### Quick Start
+
+```python
+# Just add webhook_config to your existing payload
+payload = {
+    # Your existing configuration
+    "urls": ["https://example.com"],
+    "browser_config": {...},
+    "crawler_config": {...},
+
+    # NEW: Add webhook configuration
+    "webhook_config": {
+        "webhook_url": "https://myapp.com/webhook",
+        "webhook_data_in_payload": True
+    }
+}
+```
+
+## 🔧 Configuration
+
+### Global Webhook Configuration (config.yml)
+
+```yaml
+webhooks:
+  enabled: true
+  default_url: "https://myapp.com/webhooks/default"  # Optional
+  data_in_payload: false
+  retry:
+    max_attempts: 5
+    initial_delay_ms: 1000
+    max_delay_ms: 32000
+    timeout_ms: 30000
+  headers:
+    User-Agent: "Crawl4AI-Webhook/1.0"
+```
+
+## 🚀 Upgrade Instructions
+
+### Docker
+
+```bash
+# Pull the latest image
+docker pull unclecode/crawl4ai:0.7.6
+
+# Or use the latest tag
+docker pull unclecode/crawl4ai:latest
+
+# Run with webhook support
+docker run -d \
+  -p 11235:11235 \
+  --env-file .llm.env \
+  --name crawl4ai \
+  unclecode/crawl4ai:0.7.6
+```
+
+### Python Package
+
+```bash
+pip install --upgrade crawl4ai
+```
+
+## 💡 Pro Tips
+
+1. **Use notification-only mode** for large results: fetch the data separately to avoid oversized webhook payloads
+2. **Set custom headers** for webhook authentication and request tracking
+3. **Configure a global default webhook** for consistent handling across all jobs
+4. **Implement idempotent webhook handlers**: the same webhook may be delivered multiple times on retry (see the sketch below)
+5. **Use structured schemas** with LLM extraction for predictable webhook data
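+
+Tip 4 deserves a sketch. Retries mean at-least-once delivery, so deduplicate on `task_id`. The snippet below is an illustrative pattern, not part of the Crawl4AI API; the in-memory set stands in for a durable store such as Redis:
+
+```python
+processed_task_ids = set()
+
+def handle_once(payload: dict) -> bool:
+    """Process a webhook payload at most once per task_id."""
+    task_id = payload['task_id']
+    if task_id in processed_task_ids:
+        return False  # duplicate delivery from a retry; ignore safely
+    processed_task_ids.add(task_id)
+    # ... your business logic here ...
+    return True
+```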
+
+## 🎬 Demo
+
+Try the release demo:
+
+```bash
+python docs/releases_review/demo_v0.7.6.py
+```
+
+This comprehensive demo showcases:
+- Crawl job webhooks (notification-only and with data)
+- LLM extraction webhooks (with JSON schema support)
+- Custom headers for authentication
+- Webhook retry mechanism
+- Real-time webhook receiver
+
+## 🙏 Acknowledgments
+
+Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
+
+## 📞 Support
+
+- **Documentation**: https://docs.crawl4ai.com
+- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
+- **Discord**: https://discord.gg/crawl4ai
+
+---
+
+**Happy crawling with webhooks!** 🕷️🪝
+
+*- unclecode*
diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md
index 36bf28e1..ec2c69a5 100644
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -65,13 +65,13 @@ Pull and run images directly from Docker Hub without building locally.
 
 #### 1. Pull the Image
 
-Our latest release is `0.7.3`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
+Our latest release is `0.7.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
 
-> 💡 **Note**: The `latest` tag points to the stable `0.7.3` version.
+> 💡 **Note**: The `latest` tag points to the stable `0.7.6` version.
 
 ```bash
 # Pull the latest version
-docker pull unclecode/crawl4ai:0.7.3
+docker pull unclecode/crawl4ai:0.7.6
 
 # Or pull using the latest tag
 docker pull unclecode/crawl4ai:latest
 ```
@@ -143,7 +143,7 @@ docker stop crawl4ai && docker rm crawl4ai
 
 #### Docker Hub Versioning Explained
 
 * **Image Name:** `unclecode/crawl4ai`
-* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.3`)
+* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.6`)
   * `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
   * `SUFFIX`: Optional tag for release candidates (``) and revisions (`r1`)
 * **`latest` Tag:** Points to the most recent stable version
diff --git a/docs/releases_review/demo_v0.7.6.py b/docs/releases_review/demo_v0.7.6.py
new file mode 100644
index 00000000..5d59adff
--- /dev/null
+++ b/docs/releases_review/demo_v0.7.6.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""
+Crawl4AI v0.7.6 Release Demo
+============================
+
+This demo showcases the major feature in v0.7.6:
+**Webhook Support for Docker Job Queue API**
+
+Features Demonstrated:
+1. Asynchronous job processing with webhook notifications
+2. Webhook support for the /crawl/job endpoint
+3. Webhook support for the /llm/job endpoint
+4. Notification-only vs. data-in-payload modes
+5. Custom webhook headers for authentication
+6. Structured extraction with JSON schemas
+7. Exponential backoff retry for reliable delivery
+
+Prerequisites:
+- Crawl4AI Docker container running on localhost:11235
+- Flask installed: pip install flask requests
+- LLM API key configured (for LLM examples)
+
+Usage:
+    python docs/releases_review/demo_v0.7.6.py
+"""
+
+import json
+import time
+from threading import Thread
+
+import requests
+from flask import Flask, request, jsonify
+
+# Configuration
+CRAWL4AI_BASE_URL = "http://localhost:11235"
+WEBHOOK_BASE_URL = "http://localhost:8080"
+
+# Flask app for the webhook receiver
+app = Flask(__name__)
+received_webhooks = []
+
+
+@app.route('/webhook', methods=['POST'])
+def webhook_handler():
+    """Universal webhook handler for both crawl and LLM extraction jobs."""
+    payload = request.json
+    task_id = payload['task_id']
+    task_type = payload['task_type']
+    status = payload['status']
+
+    print(f"\n{'='*70}")
+    print("📬 Webhook Received!")
+    print(f"   Task ID: {task_id}")
+    print(f"   Task Type: {task_type}")
+    print(f"   Status: {status}")
+    print(f"   Timestamp: {payload['timestamp']}")
+
+    if status == 'completed':
+        if 'data' in payload:
+            print("   ✅ Data included in webhook")
+            if task_type == 'crawl':
+                results = payload['data'].get('results', [])
+                print(f"   📊 Crawled {len(results)} URL(s)")
+            elif task_type == 'llm_extraction':
+                extracted = payload['data'].get('extracted_content', {})
+                print(f"   🤖 Extracted: {json.dumps(extracted, indent=6)}")
+        else:
+            print("   📥 Notification only (fetch data separately)")
+    elif status == 'failed':
+        print(f"   ❌ Error: {payload.get('error', 'Unknown')}")
+
+    print(f"{'='*70}\n")
+    received_webhooks.append(payload)
+
+    return jsonify({"status": "received"}), 200
+
+
+def start_webhook_server():
+    """Start the Flask webhook server in the background."""
+    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
+
+
+def demo_1_crawl_webhook_notification_only():
+    """Demo 1: Crawl job with webhook notification (data fetched separately)."""
+    print("\n" + "="*70)
+    print("DEMO 1: Crawl Job - Webhook Notification Only")
+    print("="*70)
+    print("Submitting crawl job with webhook notification...")
+
+    payload = {
+        "urls": ["https://example.com"],
+        "browser_config": {"headless": True},
+        "crawler_config": {"cache_mode": "bypass"},
+        "webhook_config": {
+            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
+            "webhook_data_in_payload": False,
+            "webhook_headers": {
+                "X-Demo": "v0.7.6",
+                "X-Type": "crawl"
+            }
+        }
+    }
+
+    response = requests.post(f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload)
+    if response.ok:
+        task_id = response.json()['task_id']
+        print(f"✅ Job submitted: {task_id}")
+        print("⏳ Webhook will notify when complete...")
+        return task_id
+    else:
+        print(f"❌ Failed: {response.text}")
+        return None
+
+
+def demo_2_crawl_webhook_with_data():
+    """Demo 2: Crawl job with full data in webhook payload."""
+    print("\n" + "="*70)
+    print("DEMO 2: Crawl Job - Webhook with Full Data")
+    print("="*70)
+    print("Submitting crawl job with data included in webhook...")
+
+    payload = {
+        "urls": ["https://www.python.org"],
+        "browser_config": {"headless": True},
+        "crawler_config": {"cache_mode": "bypass"},
+        "webhook_config": {
+            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
+            "webhook_data_in_payload": True,
+            "webhook_headers": {
+                "X-Demo": "v0.7.6",
+                "X-Type": "crawl-with-data"
+            }
+        }
+    }
+
+    response = requests.post(f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload)
+    if response.ok:
+        task_id = response.json()['task_id']
+        print(f"✅ Job submitted: {task_id}")
+        print("⏳ Webhook will include full results...")
+        return task_id
+    else:
+        print(f"❌ Failed: {response.text}")
+        return None
+
+
+def demo_3_llm_webhook_notification_only():
+    """Demo 3: LLM extraction with webhook notification (NEW in v0.7.6!)."""
+    print("\n" + "="*70)
+    print("DEMO 3: LLM Extraction - Webhook Notification Only (NEW!)")
+    print("="*70)
+    print("Submitting LLM extraction job with webhook notification...")
+
+    payload = {
+        "url": "https://www.example.com",
+        "q": "Extract the main heading and description from this page",
+        "provider": "openai/gpt-4o-mini",
+        "cache": False,
+        "webhook_config": {
+            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
+            "webhook_data_in_payload": False,
+            "webhook_headers": {
+                "X-Demo": "v0.7.6",
+                "X-Type": "llm"
+            }
+        }
+    }
+
+    response = requests.post(f"{CRAWL4AI_BASE_URL}/llm/job", json=payload)
+    if response.ok:
+        task_id = response.json()['task_id']
+        print(f"✅ Job submitted: {task_id}")
+        print("⏳ Webhook will notify when LLM extraction completes...")
+        return task_id
+    else:
+        print(f"❌ Failed: {response.text}")
+        return None
+
+
+def demo_4_llm_webhook_with_schema():
+    """Demo 4: LLM extraction with JSON schema and data in webhook (NEW in v0.7.6!)."""
+    print("\n" + "="*70)
+    print("DEMO 4: LLM Extraction - Schema + Full Data in Webhook (NEW!)")
+    print("="*70)
+    print("Submitting LLM extraction with JSON schema...")
+
+    schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string", "description": "Page title"},
+            "description": {"type": "string", "description": "Page description"},
+            "main_topics": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "Main topics covered"
+            }
+        },
+        "required": ["title"]
+    }
+
+    payload = {
+        "url": "https://www.python.org",
+        "q": "Extract the title, description, and main topics from this website",
+        "schema": json.dumps(schema),
+        "provider": "openai/gpt-4o-mini",
+        "cache": False,
+        "webhook_config": {
+            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
+            "webhook_data_in_payload": True,
+            "webhook_headers": {
+                "X-Demo": "v0.7.6",
+                "X-Type": "llm-with-schema"
+            }
+        }
+    }
+
+    response = requests.post(f"{CRAWL4AI_BASE_URL}/llm/job", json=payload)
+    if response.ok:
+        task_id = response.json()['task_id']
+        print(f"✅ Job submitted: {task_id}")
+        print("⏳ Webhook will include structured extraction results...")
+        return task_id
+    else:
+        print(f"❌ Failed: {response.text}")
+        return None
+
+
+def demo_5_global_webhook_config():
+    """Demo 5: Using global webhook configuration from config.yml."""
+    print("\n" + "="*70)
+    print("DEMO 5: Global Webhook Configuration")
+    print("="*70)
+    print("💡 You can configure a default webhook URL in config.yml:")
+    print("""
+    webhooks:
+      enabled: true
+      default_url: "https://myapp.com/webhooks/default"
+      data_in_payload: false
+      retry:
+        max_attempts: 5
+        initial_delay_ms: 1000
+        max_delay_ms: 32000
+        timeout_ms: 30000
+    """)
+    print("Then submit jobs WITHOUT webhook_config - they'll use the default!")
+    print("This is useful for consistent webhook handling across all jobs.")
+
+
+def demo_6_webhook_retry_logic():
+    """Demo 6: Webhook retry mechanism with exponential backoff."""
+    print("\n" + "="*70)
+    print("DEMO 6: Webhook Retry Logic")
+    print("="*70)
+    print("🔄 Webhook delivery uses exponential backoff retry:")
+    print("   • Max attempts: 5")
+    print("   • Delays: 1s → 2s → 4s → 8s → 16s")
+    print("   • Timeout: 30s per attempt")
+    print("   • Retries on: 5xx errors, network errors, timeouts")
+    print("   • No retry on: 4xx client errors")
+    print("\nThis ensures reliable webhook delivery even with temporary failures!")
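+
+
+# Illustrative only: a client-side re-implementation of the delivery loop that
+# demo_6 describes, so the retry schedule is concrete. This is NOT the server's
+# actual delivery service; the function name and structure are hypothetical.
+def sketch_deliver_with_backoff(url, payload, max_attempts=5, initial_delay=1.0):
+    """Post a webhook, retrying on 5xx/network errors with doubling delays."""
+    delay = initial_delay
+    for attempt in range(1, max_attempts + 1):
+        try:
+            resp = requests.post(url, json=payload, timeout=30)
+            if resp.status_code < 500:
+                return resp  # delivered (4xx client errors are not retried)
+        except requests.exceptions.RequestException:
+            pass  # network error or timeout: fall through and retry
+        if attempt < max_attempts:
+            time.sleep(delay)  # 1s, 2s, 4s, 8s, ... doubling each retry
+            delay *= 2
+    raise RuntimeError(f"Webhook not delivered after {max_attempts} attempts")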
+
+
+def print_summary():
+    """Print demo summary and results."""
+    print("\n" + "="*70)
+    print("📊 DEMO SUMMARY")
+    print("="*70)
+    print(f"Total webhooks received: {len(received_webhooks)}")
+
+    crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
+    llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']
+
+    print("\nBreakdown:")
+    print(f"   🕷️ Crawl jobs: {len(crawl_webhooks)}")
+    print(f"   🤖 LLM extraction jobs: {len(llm_webhooks)}")
+
+    print("\nDetails:")
+    for i, webhook in enumerate(received_webhooks, 1):
+        icon = "🕷️" if webhook['task_type'] == 'crawl' else "🤖"
+        print(f"   {i}. {icon} {webhook['task_id']}: {webhook['status']}")
+
+    print("\n" + "="*70)
+    print("✨ v0.7.6 KEY FEATURES DEMONSTRATED:")
+    print("="*70)
+    print("✅ Webhook support for /crawl/job")
+    print("✅ Webhook support for /llm/job (NEW!)")
+    print("✅ Notification-only mode (fetch data separately)")
+    print("✅ Data-in-payload mode (get full results in webhook)")
+    print("✅ Custom headers for authentication")
+    print("✅ JSON schema for structured LLM extraction")
+    print("✅ Exponential backoff retry for reliable delivery")
+    print("✅ Global webhook configuration support")
+    print("✅ Universal webhook handler for both job types")
+    print("\n💡 Benefits:")
+    print("   • No more polling - get instant notifications")
+    print("   • Better resource utilization")
+    print("   • Reliable delivery with automatic retries")
+    print("   • Consistent API across crawl and LLM jobs")
+    print("   • Production-ready webhook infrastructure")
+
+
+def main():
+    """Run all demos."""
+    print("\n" + "="*70)
+    print("🚀 Crawl4AI v0.7.6 Release Demo")
+    print("="*70)
+    print("Feature: Webhook Support for Docker Job Queue API")
+    print("="*70)
+
+    # Check if the server is running
+    try:
+        requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
+        print("✅ Crawl4AI server is running")
+    except requests.exceptions.RequestException:
+        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
+        print("Please start the Docker container:")
+        print("   docker run -d -p 11235:11235 --env-file .llm.env unclecode/crawl4ai:0.7.6")
+        return
+
+    # Start the webhook server
+    print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
+    webhook_thread = Thread(target=start_webhook_server, daemon=True)
+    webhook_thread.start()
+    time.sleep(2)
+
+    # Run demos
+    demo_1_crawl_webhook_notification_only()
+    time.sleep(5)
+
+    demo_2_crawl_webhook_with_data()
+    time.sleep(5)
+
+    demo_3_llm_webhook_notification_only()
+    time.sleep(5)
+
+    demo_4_llm_webhook_with_schema()
+    time.sleep(5)
+
+    demo_5_global_webhook_config()
+    demo_6_webhook_retry_logic()
+
+    # Wait for webhooks
+    print("\n⏳ Waiting for all webhooks to arrive...")
+    time.sleep(30)
+
+    # Print summary
+    print_summary()
+
+    print("\n" + "="*70)
+    print("✅ Demo completed!")
+    print("="*70)
+    print("\n📚 Documentation:")
+    print("   • deploy/docker/WEBHOOK_EXAMPLES.md")
+    print("   • docs/examples/docker_webhook_example.py")
+    print("\n🔗 Upgrade:")
+    print("   docker pull unclecode/crawl4ai:0.7.6")
+
+
+if __name__ == "__main__":
+    main()