Merge branch 'develop' into feature/docker-cluster
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -267,6 +267,8 @@ continue_config.json
|
|||||||
.llm.env
|
.llm.env
|
||||||
.private/
|
.private/
|
||||||
|
|
||||||
|
.claude/
|
||||||
|
|
||||||
CLAUDE_MONITOR.md
|
CLAUDE_MONITOR.md
|
||||||
CLAUDE.md
|
CLAUDE.md
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
FROM python:3.12-slim-bookworm AS build
|
FROM python:3.12-slim-bookworm AS build
|
||||||
|
|
||||||
# C4ai version
|
# C4ai version
|
||||||
ARG C4AI_VER=0.7.0-r1
|
ARG C4AI_VER=0.7.6
|
||||||
ENV C4AI_VERSION=$C4AI_VER
|
ENV C4AI_VERSION=$C4AI_VER
|
||||||
LABEL c4ai.version=$C4AI_VER
|
LABEL c4ai.version=$C4AI_VER
|
||||||
|
|
||||||
|
|||||||
88
README.md
88
README.md
@@ -27,11 +27,13 @@
|
|||||||
|
|
||||||
Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community.
|
Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community.
|
||||||
|
|
||||||
[✨ Check out latest update v0.7.4](#-recent-updates)
|
[✨ Check out latest update v0.7.6](#-recent-updates)
|
||||||
|
|
||||||
✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
✨ **New in v0.7.6**: Complete Webhook Infrastructure for Docker Job Queue API! Real-time notifications for both `/crawl/job` and `/llm/job` endpoints with exponential backoff retry, custom headers, and flexible delivery modes. No more polling! [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.6.md)
|
||||||
|
|
||||||
✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
|
✨ Recent v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
||||||
|
|
||||||
|
✨ Previous v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||||
@@ -177,7 +179,7 @@ No rate-limited APIs. No lock-in. Build and own your data pipeline with direct g
|
|||||||
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
||||||
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
||||||
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
||||||
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
|
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior (supports both string and function-based APIs).
|
||||||
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
||||||
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
||||||
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
||||||
@@ -544,6 +546,54 @@ async def test_news_crawl():
|
|||||||
|
|
||||||
## ✨ Recent Updates
|
## ✨ Recent Updates
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><strong>Version 0.7.5 Release Highlights - The Docker Hooks & Security Update</strong></summary>
|
||||||
|
|
||||||
|
- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions at 8 key points
|
||||||
|
- **✨ Function-Based Hooks API (NEW)**: Write hooks as regular Python functions with full IDE support:
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers"""
|
||||||
|
await page.set_extra_http_headers({'X-Crawl4AI': 'v0.7.5'})
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Option 1: Use hooks_to_string() utility for REST API
|
||||||
|
hooks_code = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
})
|
||||||
|
|
||||||
|
# Option 2: Docker client with automatic conversion (Recommended)
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# ✓ Full IDE support, type checking, and reusability!
|
||||||
|
```
|
||||||
|
|
||||||
|
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
||||||
|
- **🔒 HTTPS Preservation**: Secure internal link handling with `preserve_https_for_internal_links=True`
|
||||||
|
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
||||||
|
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
||||||
|
|
||||||
|
[Full v0.7.5 Release Notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
|
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
|
||||||
|
|
||||||
@@ -919,6 +969,36 @@ We envision a future where AI is powered by real human knowledge, ensuring data
|
|||||||
For more details, see our [full mission statement](./MISSION.md).
|
For more details, see our [full mission statement](./MISSION.md).
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
## 🌟 Current Sponsors
|
||||||
|
|
||||||
|
### 🏢 Enterprise Sponsors & Partners
|
||||||
|
|
||||||
|
Our enterprise sponsors and technology partners help scale Crawl4AI to power production-grade data pipelines.
|
||||||
|
|
||||||
|
| Company | About | Sponsorship Tier |
|
||||||
|
|------|------|----------------------------|
|
||||||
|
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥈 Silver |
|
||||||
|
| <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
|
||||||
|
| <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
|
||||||
|
| <a href="https://www.alephnull.sg/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013050323_a9e8e8c4c3650421.svg" alt="Aleph null" width="120"/></a> | Singapore-based Aleph Null is Asia’s leading edtech hub, dedicated to student-centric, AI-driven education—empowering learners with the tools to thrive in a fast-changing world. | 🥇 Gold |
|
||||||
|
|
||||||
|
### 🧑🤝 Individual Sponsors
|
||||||
|
|
||||||
|
A heartfelt thanks to our individual supporters! Every contribution helps us keep our opensource mission alive and thriving!
|
||||||
|
|
||||||
|
<p align="left">
|
||||||
|
<a href="https://github.com/hafezparast"><img src="https://avatars.githubusercontent.com/u/14273305?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||||
|
<a href="https://github.com/ntohidi"><img src="https://avatars.githubusercontent.com/u/17140097?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/Sjoeborg"><img src="https://avatars.githubusercontent.com/u/17451310?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/romek-rozen"><img src="https://avatars.githubusercontent.com/u/30595969?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/Kourosh-Kiyani"><img src="https://avatars.githubusercontent.com/u/34105600?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/Etherdrake"><img src="https://avatars.githubusercontent.com/u/67021215?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/shaman247"><img src="https://avatars.githubusercontent.com/u/211010067?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/work-flow-manager"><img src="https://avatars.githubusercontent.com/u/217665461?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
> Want to join them? [Sponsor Crawl4AI →](https://github.com/sponsors/unclecode)
|
||||||
|
|
||||||
## Star History
|
## Star History
|
||||||
|
|
||||||
[](https://star-history.com/#unclecode/crawl4ai&Date)
|
[](https://star-history.com/#unclecode/crawl4ai&Date)
|
||||||
|
|||||||
@@ -103,7 +103,8 @@ from .browser_adapter import (
|
|||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
start_colab_display_server,
|
start_colab_display_server,
|
||||||
setup_colab_environment
|
setup_colab_environment,
|
||||||
|
hooks_to_string
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -183,6 +184,7 @@ __all__ = [
|
|||||||
"ProxyConfig",
|
"ProxyConfig",
|
||||||
"start_colab_display_server",
|
"start_colab_display_server",
|
||||||
"setup_colab_environment",
|
"setup_colab_environment",
|
||||||
|
"hooks_to_string",
|
||||||
# C4A Script additions
|
# C4A Script additions
|
||||||
"c4a_compile",
|
"c4a_compile",
|
||||||
"c4a_validate",
|
"c4a_validate",
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# crawl4ai/__version__.py
|
# crawl4ai/__version__.py
|
||||||
|
|
||||||
# This is the version that will be used for stable releases
|
# This is the version that will be used for stable releases
|
||||||
__version__ = "0.7.4"
|
__version__ = "0.7.6"
|
||||||
|
|
||||||
# For nightly builds, this gets set during build process
|
# For nightly builds, this gets set during build process
|
||||||
__nightly_version__ = None
|
__nightly_version__ = None
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any
|
from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
|
||||||
import httpx
|
import httpx
|
||||||
import json
|
import json
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
@@ -7,6 +7,7 @@ import asyncio
|
|||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
from .async_logger import AsyncLogger, LogLevel
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
|
from .utils import hooks_to_string
|
||||||
|
|
||||||
|
|
||||||
class Crawl4aiClientError(Exception):
|
class Crawl4aiClientError(Exception):
|
||||||
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
|
|||||||
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
||||||
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
||||||
|
|
||||||
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
def _prepare_request(
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
|
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||||
|
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||||
|
hooks_timeout: int = 30
|
||||||
|
) -> Dict[str, Any]:
|
||||||
"""Prepare request data from configs."""
|
"""Prepare request data from configs."""
|
||||||
if self._token:
|
if self._token:
|
||||||
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||||
return {
|
|
||||||
|
request_data = {
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
"browser_config": browser_config.dump() if browser_config else {},
|
"browser_config": browser_config.dump() if browser_config else {},
|
||||||
"crawler_config": crawler_config.dump() if crawler_config else {}
|
"crawler_config": crawler_config.dump() if crawler_config else {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Handle hooks if provided
|
||||||
|
if hooks:
|
||||||
|
# Check if hooks are already strings or need conversion
|
||||||
|
if any(callable(v) for v in hooks.values()):
|
||||||
|
# Convert function objects to strings
|
||||||
|
hooks_code = hooks_to_string(hooks)
|
||||||
|
else:
|
||||||
|
# Already in string format
|
||||||
|
hooks_code = hooks
|
||||||
|
|
||||||
|
request_data["hooks"] = {
|
||||||
|
"code": hooks_code,
|
||||||
|
"timeout": hooks_timeout
|
||||||
|
}
|
||||||
|
|
||||||
|
return request_data
|
||||||
|
|
||||||
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
||||||
"""Make an HTTP request with error handling."""
|
"""Make an HTTP request with error handling."""
|
||||||
url = urljoin(self.base_url, endpoint)
|
url = urljoin(self.base_url, endpoint)
|
||||||
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None
|
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||||
|
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||||
|
hooks_timeout: int = 30
|
||||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||||
"""Execute a crawl operation."""
|
"""
|
||||||
|
Execute a crawl operation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls: List of URLs to crawl
|
||||||
|
browser_config: Browser configuration
|
||||||
|
crawler_config: Crawler configuration
|
||||||
|
hooks: Optional hooks - can be either:
|
||||||
|
- Dict[str, Callable]: Function objects that will be converted to strings
|
||||||
|
- Dict[str, str]: Already stringified hook code
|
||||||
|
hooks_timeout: Timeout in seconds for each hook execution (1-120)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Single CrawlResult, list of results, or async generator for streaming
|
||||||
|
|
||||||
|
Example with function hooks:
|
||||||
|
>>> async def my_hook(page, context, **kwargs):
|
||||||
|
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
... return page
|
||||||
|
>>>
|
||||||
|
>>> result = await client.crawl(
|
||||||
|
... ["https://example.com"],
|
||||||
|
... hooks={"on_page_context_created": my_hook}
|
||||||
|
... )
|
||||||
|
"""
|
||||||
await self._check_server()
|
await self._check_server()
|
||||||
|
|
||||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
|
||||||
is_streaming = crawler_config and crawler_config.stream
|
is_streaming = crawler_config and crawler_config.stream
|
||||||
|
|
||||||
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
||||||
|
|
||||||
if is_streaming:
|
if is_streaming:
|
||||||
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
||||||
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
||||||
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
|
|||||||
else:
|
else:
|
||||||
yield CrawlResult(**result)
|
yield CrawlResult(**result)
|
||||||
return stream_results()
|
return stream_results()
|
||||||
|
|
||||||
response = await self._request("POST", "/crawl", json=data)
|
response = await self._request("POST", "/crawl", json=data)
|
||||||
result_data = response.json()
|
result_data = response.json()
|
||||||
if not result_data.get("success", False):
|
if not result_data.get("success", False):
|
||||||
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
||||||
|
|
||||||
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
||||||
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
||||||
return results[0] if len(results) == 1 else results
|
return results[0] if len(results) == 1 else results
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ from urllib.parse import (
|
|||||||
urljoin, urlparse, urlunparse,
|
urljoin, urlparse, urlunparse,
|
||||||
parse_qsl, urlencode, quote, unquote
|
parse_qsl, urlencode, quote, unquote
|
||||||
)
|
)
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
|
||||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||||
@@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
|
|||||||
available_gb = get_true_available_memory_gb()
|
available_gb = get_true_available_memory_gb()
|
||||||
used_percent = get_true_memory_usage_percent()
|
used_percent = get_true_memory_usage_percent()
|
||||||
|
|
||||||
return used_percent, available_gb, total_gb
|
return used_percent, available_gb, total_gb
|
||||||
|
|
||||||
|
|
||||||
|
# Hook utilities for Docker API
|
||||||
|
def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
|
||||||
|
"""
|
||||||
|
Convert hook function objects to string representations for Docker API.
|
||||||
|
|
||||||
|
This utility simplifies the process of using hooks with the Docker API by converting
|
||||||
|
Python function objects into the string format required by the API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
hooks: Dictionary mapping hook point names to Python function objects.
|
||||||
|
Functions should be async and follow hook signature requirements.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary mapping hook point names to string representations of the functions.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> async def my_hook(page, context, **kwargs):
|
||||||
|
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
... return page
|
||||||
|
>>>
|
||||||
|
>>> hooks_dict = {"on_page_context_created": my_hook}
|
||||||
|
>>> api_hooks = hooks_to_string(hooks_dict)
|
||||||
|
>>> # api_hooks is now ready to use with Docker API
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If a hook is not callable or source cannot be extracted
|
||||||
|
"""
|
||||||
|
result = {}
|
||||||
|
|
||||||
|
for hook_name, hook_func in hooks.items():
|
||||||
|
if not callable(hook_func):
|
||||||
|
raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get the source code of the function
|
||||||
|
source = inspect.getsource(hook_func)
|
||||||
|
# Remove any leading indentation to get clean source
|
||||||
|
source = textwrap.dedent(source)
|
||||||
|
result[hook_name] = source
|
||||||
|
except (OSError, TypeError) as e:
|
||||||
|
raise ValueError(
|
||||||
|
f"Cannot extract source code for hook '{hook_name}'. "
|
||||||
|
f"Make sure the function is defined in a file (not interactively). Error: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
- [Python SDK](#python-sdk)
|
- [Python SDK](#python-sdk)
|
||||||
- [Understanding Request Schema](#understanding-request-schema)
|
- [Understanding Request Schema](#understanding-request-schema)
|
||||||
- [REST API Examples](#rest-api-examples)
|
- [REST API Examples](#rest-api-examples)
|
||||||
|
- [Asynchronous Jobs with Webhooks](#asynchronous-jobs-with-webhooks)
|
||||||
- [Additional API Endpoints](#additional-api-endpoints)
|
- [Additional API Endpoints](#additional-api-endpoints)
|
||||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||||
@@ -58,15 +59,13 @@ Pull and run images directly from Docker Hub without building locally.
|
|||||||
|
|
||||||
#### 1. Pull the Image
|
#### 1. Pull the Image
|
||||||
|
|
||||||
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
Our latest stable release is `0.7.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||||
|
|
||||||
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Pull the release candidate (for testing new features)
|
# Pull the latest stable version (0.7.6)
|
||||||
docker pull unclecode/crawl4ai:0.7.0-r1
|
docker pull unclecode/crawl4ai:0.7.6
|
||||||
|
|
||||||
# Or pull the current stable version (0.6.0)
|
# Or use the latest tag (points to 0.7.6)
|
||||||
docker pull unclecode/crawl4ai:latest
|
docker pull unclecode/crawl4ai:latest
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -101,7 +100,7 @@ EOL
|
|||||||
-p 11235:11235 \
|
-p 11235:11235 \
|
||||||
--name crawl4ai \
|
--name crawl4ai \
|
||||||
--shm-size=1g \
|
--shm-size=1g \
|
||||||
unclecode/crawl4ai:0.7.0-r1
|
unclecode/crawl4ai:0.7.6
|
||||||
```
|
```
|
||||||
|
|
||||||
* **With LLM support:**
|
* **With LLM support:**
|
||||||
@@ -112,7 +111,7 @@ EOL
|
|||||||
--name crawl4ai \
|
--name crawl4ai \
|
||||||
--env-file .llm.env \
|
--env-file .llm.env \
|
||||||
--shm-size=1g \
|
--shm-size=1g \
|
||||||
unclecode/crawl4ai:0.7.0-r1
|
unclecode/crawl4ai:0.7.6
|
||||||
```
|
```
|
||||||
|
|
||||||
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
|
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
|
||||||
@@ -185,7 +184,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
|
|||||||
```bash
|
```bash
|
||||||
# Pulls and runs the release candidate from Docker Hub
|
# Pulls and runs the release candidate from Docker Hub
|
||||||
# Automatically selects the correct architecture
|
# Automatically selects the correct architecture
|
||||||
IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
|
IMAGE=unclecode/crawl4ai:0.7.6 docker compose up -d
|
||||||
```
|
```
|
||||||
|
|
||||||
* **Build and Run Locally:**
|
* **Build and Run Locally:**
|
||||||
@@ -648,6 +647,194 @@ async def test_stream_crawl(token: str = None): # Made token optional
|
|||||||
# asyncio.run(test_stream_crawl())
|
# asyncio.run(test_stream_crawl())
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Asynchronous Jobs with Webhooks
|
||||||
|
|
||||||
|
For long-running crawls or when you want to avoid keeping connections open, use the job queue endpoints. Instead of polling for results, configure a webhook to receive notifications when jobs complete.
|
||||||
|
|
||||||
|
#### Why Use Jobs & Webhooks?
|
||||||
|
|
||||||
|
- **No Polling Required** - Get notified when crawls complete instead of constantly checking status
|
||||||
|
- **Better Resource Usage** - Free up client connections while jobs run in the background
|
||||||
|
- **Scalable Architecture** - Ideal for high-volume crawling with TypeScript/Node.js clients or microservices
|
||||||
|
- **Reliable Delivery** - Automatic retry with exponential backoff (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||||
|
|
||||||
|
#### How It Works
|
||||||
|
|
||||||
|
1. **Submit Job** → POST to `/crawl/job` with optional `webhook_config`
|
||||||
|
2. **Get Task ID** → Receive a `task_id` immediately
|
||||||
|
3. **Job Runs** → Crawl executes in the background
|
||||||
|
4. **Webhook Fired** → Server POSTs completion notification to your webhook URL
|
||||||
|
5. **Fetch Results** → If data wasn't included in webhook, GET `/crawl/job/{task_id}`
|
||||||
|
|
||||||
|
#### Quick Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Submit a crawl job with webhook notification
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
|
||||||
|
# Response: {"task_id": "crawl_a1b2c3d4"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Your webhook receives:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then fetch the results:
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Include Data in Webhook
|
||||||
|
|
||||||
|
Set `webhook_data_in_payload: true` to receive the full crawl results directly in the webhook:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Your webhook receives the complete data:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": {
|
||||||
|
"markdown": "...",
|
||||||
|
"html": "...",
|
||||||
|
"links": {...},
|
||||||
|
"metadata": {...}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Webhook Authentication
|
||||||
|
|
||||||
|
Add custom headers for authentication:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl",
|
||||||
|
"webhook_data_in_payload": false,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token",
|
||||||
|
"X-Service-ID": "crawl4ai-prod"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Global Default Webhook
|
||||||
|
|
||||||
|
Configure a default webhook URL in `config.yml` for all jobs:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: "https://myapp.com/webhooks/default"
|
||||||
|
data_in_payload: false
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000
|
||||||
|
```
|
||||||
|
|
||||||
|
Now jobs without `webhook_config` automatically use the default webhook.
|
||||||
|
|
||||||
|
#### Job Status Polling (Without Webhooks)
|
||||||
|
|
||||||
|
If you prefer polling instead of webhooks, just omit `webhook_config`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Submit job
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"urls": ["https://example.com"]}'
|
||||||
|
# Response: {"task_id": "crawl_xyz"}
|
||||||
|
|
||||||
|
# Poll for status
|
||||||
|
curl http://localhost:11235/crawl/job/crawl_xyz
|
||||||
|
```
|
||||||
|
|
||||||
|
The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`.
|
||||||
|
|
||||||
|
#### LLM Extraction Jobs with Webhooks
|
||||||
|
|
||||||
|
The same webhook system works for LLM extraction jobs via `/llm/job`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Submit LLM extraction job with webhook
|
||||||
|
curl -X POST http://localhost:11235/llm/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"q": "Extract the article title, author, and main points",
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||||
|
"webhook_data_in_payload": true,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
|
||||||
|
# Response: {"task_id": "llm_1234567890"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Your webhook receives:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1234567890",
|
||||||
|
"task_type": "llm_extraction",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com/article"],
|
||||||
|
"data": {
|
||||||
|
"extracted_content": {
|
||||||
|
"title": "Understanding Web Scraping",
|
||||||
|
"author": "John Doe",
|
||||||
|
"main_points": ["Point 1", "Point 2", "Point 3"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Differences for LLM Jobs:**
|
||||||
|
- Task type is `"llm_extraction"` instead of `"crawl"`
|
||||||
|
- Extracted data is in `data.extracted_content`
|
||||||
|
- Single URL only (not an array)
|
||||||
|
- Supports schema-based extraction with `schema` parameter
|
||||||
|
|
||||||
|
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Metrics & Monitoring
|
## Metrics & Monitoring
|
||||||
@@ -826,10 +1013,11 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
|||||||
|
|
||||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||||
- Building and running the Docker container
|
- Building and running the Docker container
|
||||||
- Configuring the environment
|
- Configuring the environment
|
||||||
- Using the interactive playground for testing
|
- Using the interactive playground for testing
|
||||||
- Making API requests with proper typing
|
- Making API requests with proper typing
|
||||||
- Using the Python SDK
|
- Using the Python SDK
|
||||||
|
- Asynchronous job queues with webhook notifications
|
||||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||||
- Connecting via the Model Context Protocol (MCP)
|
- Connecting via the Model Context Protocol (MCP)
|
||||||
- Monitoring your deployment
|
- Monitoring your deployment
|
||||||
|
|||||||
378
deploy/docker/WEBHOOK_EXAMPLES.md
Normal file
378
deploy/docker/WEBHOOK_EXAMPLES.md
Normal file
@@ -0,0 +1,378 @@
|
|||||||
|
# Webhook Feature Examples
|
||||||
|
|
||||||
|
This document provides examples of how to use the webhook feature for crawl jobs in Crawl4AI.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The webhook feature allows you to receive notifications when crawl jobs complete, eliminating the need for polling. Webhooks are sent with exponential backoff retry logic to ensure reliable delivery.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Global Configuration (config.yml)
|
||||||
|
|
||||||
|
You can configure default webhook settings in `config.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: null # Optional: default webhook URL for all jobs
|
||||||
|
data_in_payload: false # Optional: default behavior for including data
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000 # 30s timeout per webhook call
|
||||||
|
headers: # Optional: default headers to include
|
||||||
|
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Usage Examples
|
||||||
|
|
||||||
|
### Example 1: Basic Webhook (Notification Only)
|
||||||
|
|
||||||
|
Send a webhook notification without including the crawl data in the payload.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Webhook Payload Received:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Your webhook handler should then fetch the results:
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 2: Webhook with Data Included
|
||||||
|
|
||||||
|
Include the full crawl results in the webhook payload.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Webhook Payload Received:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": {
|
||||||
|
"markdown": "...",
|
||||||
|
"html": "...",
|
||||||
|
"links": {...},
|
||||||
|
"metadata": {...}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 3: Webhook with Custom Headers
|
||||||
|
|
||||||
|
Include custom headers for authentication or identification.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "my-secret-token",
|
||||||
|
"X-Service-ID": "crawl4ai-production"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
The webhook is sent with these custom headers merged over the default headers from config — custom headers take precedence on a name conflict, and `Content-Type: application/json` is always set.
|
||||||
|
|
||||||
|
### Example 4: Failure Notification
|
||||||
|
|
||||||
|
When a crawl job fails, a webhook is sent with error details.
|
||||||
|
|
||||||
|
**Webhook Payload on Failure:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "failed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"error": "Connection timeout after 30s"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 5: Using Global Default Webhook
|
||||||
|
|
||||||
|
If you set a `default_url` in config.yml, jobs without webhook_config will use it:
|
||||||
|
|
||||||
|
**config.yml:**
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: "https://myapp.com/webhooks/default"
|
||||||
|
data_in_payload: false
|
||||||
|
```
|
||||||
|
|
||||||
|
**Request (no webhook_config needed):**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
The webhook will be sent to the default URL configured in config.yml.
|
||||||
|
|
||||||
|
### Example 6: LLM Extraction Job with Webhook
|
||||||
|
|
||||||
|
Use webhooks with the LLM extraction endpoint for asynchronous processing.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/llm/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"q": "Extract the article title, author, and publication date",
|
||||||
|
"schema": "{\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"author\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}}}",
|
||||||
|
"cache": false,
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432_12345"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Webhook Payload Received:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432_12345",
|
||||||
|
"task_type": "llm_extraction",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com/article"],
|
||||||
|
"data": {
|
||||||
|
"extracted_content": {
|
||||||
|
"title": "Understanding Web Scraping",
|
||||||
|
"author": "John Doe",
|
||||||
|
"date": "2025-10-21"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Webhook Handler Example
|
||||||
|
|
||||||
|
Here's a simple Python Flask webhook handler that supports both crawl and LLM extraction jobs (register the same handler on every webhook path you configure, e.g. `/webhooks/crawl-complete` and `/webhooks/llm-complete`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
import requests
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@app.route('/webhooks/crawl-complete', methods=['POST'])
|
||||||
|
def handle_crawl_webhook():
|
||||||
|
payload = request.json
|
||||||
|
|
||||||
|
task_id = payload['task_id']
|
||||||
|
task_type = payload['task_type']
|
||||||
|
status = payload['status']
|
||||||
|
|
||||||
|
if status == 'completed':
|
||||||
|
# If data not in payload, fetch it
|
||||||
|
if 'data' not in payload:
|
||||||
|
# Determine endpoint based on task type
|
||||||
|
endpoint = 'crawl' if task_type == 'crawl' else 'llm'
|
||||||
|
response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
|
||||||
|
data = response.json()
|
||||||
|
else:
|
||||||
|
data = payload['data']
|
||||||
|
|
||||||
|
# Process based on task type
|
||||||
|
if task_type == 'crawl':
|
||||||
|
print(f"Processing crawl results for {task_id}")
|
||||||
|
# Handle crawl results
|
||||||
|
results = data.get('results', [])
|
||||||
|
for result in results:
|
||||||
|
print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
|
||||||
|
|
||||||
|
elif task_type == 'llm_extraction':
|
||||||
|
print(f"Processing LLM extraction for {task_id}")
|
||||||
|
# Handle LLM extraction
|
||||||
|
# Note: Webhook sends 'extracted_content', API returns 'result'
|
||||||
|
extracted = data.get('extracted_content', data.get('result', {}))
|
||||||
|
print(f" - Extracted: {extracted}")
|
||||||
|
|
||||||
|
# Your business logic here...
|
||||||
|
|
||||||
|
elif status == 'failed':
|
||||||
|
error = payload.get('error', 'Unknown error')
|
||||||
|
print(f"{task_type} job {task_id} failed: {error}")
|
||||||
|
# Handle failure...
|
||||||
|
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app.run(port=8080)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Retry Logic
|
||||||
|
|
||||||
|
The webhook delivery service uses exponential backoff retry logic:
|
||||||
|
|
||||||
|
- **Attempts:** Up to 5 attempts by default
|
||||||
|
- **Delays:** exponential backoff — 1s → 2s → 4s → 8s → 16s, capped at 32s (`max_delay_ms`)
|
||||||
|
- **Timeout:** 30 seconds per attempt
|
||||||
|
- **Retry Conditions:**
|
||||||
|
- Server errors (5xx status codes)
|
||||||
|
- Network errors
|
||||||
|
- Timeouts
|
||||||
|
- **No Retry:**
|
||||||
|
- Client errors (4xx status codes)
|
||||||
|
- Successful delivery (2xx status codes)
|
||||||
|
|
||||||
|
## Benefits
|
||||||
|
|
||||||
|
1. **No Polling Required** - Eliminates constant API calls to check job status
|
||||||
|
2. **Real-time Notifications** - Immediate notification when jobs complete
|
||||||
|
3. **Reliable Delivery** - Exponential backoff ensures webhooks are delivered
|
||||||
|
4. **Flexible** - Choose between notification-only or full data delivery
|
||||||
|
5. **Secure** - Support for custom headers for authentication
|
||||||
|
6. **Configurable** - Global defaults or per-job configuration
|
||||||
|
7. **Universal Support** - Works with both `/crawl/job` and `/llm/job` endpoints
|
||||||
|
|
||||||
|
## TypeScript Client Example
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
interface WebhookConfig {
|
||||||
|
webhook_url: string;
|
||||||
|
webhook_data_in_payload?: boolean;
|
||||||
|
webhook_headers?: Record<string, string>;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface CrawlJobRequest {
|
||||||
|
urls: string[];
|
||||||
|
browser_config?: Record<string, any>;
|
||||||
|
crawler_config?: Record<string, any>;
|
||||||
|
webhook_config?: WebhookConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface LLMJobRequest {
|
||||||
|
url: string;
|
||||||
|
q: string;
|
||||||
|
schema?: string;
|
||||||
|
cache?: boolean;
|
||||||
|
provider?: string;
|
||||||
|
webhook_config?: WebhookConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function createCrawlJob(request: CrawlJobRequest) {
|
||||||
|
const response = await fetch('http://localhost:11235/crawl/job', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify(request)
|
||||||
|
});
|
||||||
|
|
||||||
|
const { task_id } = await response.json();
|
||||||
|
return task_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function createLLMJob(request: LLMJobRequest) {
|
||||||
|
const response = await fetch('http://localhost:11235/llm/job', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify(request)
|
||||||
|
});
|
||||||
|
|
||||||
|
const { task_id } = await response.json();
|
||||||
|
return task_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Usage - Crawl Job
|
||||||
|
const crawlTaskId = await createCrawlJob({
|
||||||
|
urls: ['https://example.com'],
|
||||||
|
webhook_config: {
|
||||||
|
webhook_url: 'https://myapp.com/webhooks/crawl-complete',
|
||||||
|
webhook_data_in_payload: false,
|
||||||
|
webhook_headers: {
|
||||||
|
'X-Webhook-Secret': 'my-secret'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Usage - LLM Extraction Job
|
||||||
|
const llmTaskId = await createLLMJob({
|
||||||
|
url: 'https://example.com/article',
|
||||||
|
q: 'Extract the main points from this article',
|
||||||
|
provider: 'openai/gpt-4o-mini',
|
||||||
|
webhook_config: {
|
||||||
|
webhook_url: 'https://myapp.com/webhooks/llm-complete',
|
||||||
|
webhook_data_in_payload: true,
|
||||||
|
webhook_headers: {
|
||||||
|
'X-Webhook-Secret': 'my-secret'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring and Debugging
|
||||||
|
|
||||||
|
Webhook delivery attempts are logged at INFO level:
|
||||||
|
- Successful deliveries
|
||||||
|
- Retry attempts with delays
|
||||||
|
- Final failures after max attempts
|
||||||
|
|
||||||
|
Check the application logs for webhook delivery status:
|
||||||
|
```bash
|
||||||
|
docker logs crawl4ai-container | grep -i webhook
|
||||||
|
```
|
||||||
@@ -46,6 +46,7 @@ from utils import (
|
|||||||
get_llm_temperature,
|
get_llm_temperature,
|
||||||
get_llm_base_url
|
get_llm_base_url
|
||||||
)
|
)
|
||||||
|
from webhook import WebhookDeliveryService
|
||||||
|
|
||||||
import psutil, time
|
import psutil, time
|
||||||
|
|
||||||
@@ -127,10 +128,14 @@ async def process_llm_extraction(
|
|||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None,
|
||||||
|
webhook_config: Optional[Dict] = None,
|
||||||
temperature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
base_url: Optional[str] = None
|
base_url: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process LLM extraction in background."""
|
"""Process LLM extraction in background."""
|
||||||
|
# Initialize webhook service
|
||||||
|
webhook_service = WebhookDeliveryService(config)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Validate provider
|
# Validate provider
|
||||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||||
@@ -139,6 +144,16 @@ async def process_llm_extraction(
|
|||||||
"status": TaskStatus.FAILED,
|
"status": TaskStatus.FAILED,
|
||||||
"error": error_msg
|
"error": error_msg
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="failed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=error_msg
|
||||||
|
)
|
||||||
return
|
return
|
||||||
api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
|
api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
@@ -169,17 +184,40 @@ async def process_llm_extraction(
|
|||||||
"status": TaskStatus.FAILED,
|
"status": TaskStatus.FAILED,
|
||||||
"error": result.error_message
|
"error": result.error_message
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="failed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=result.error_message
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content = json.loads(result.extracted_content)
|
content = json.loads(result.extracted_content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
content = result.extracted_content
|
content = result.extracted_content
|
||||||
|
|
||||||
|
result_data = {"extracted_content": content}
|
||||||
|
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
"status": TaskStatus.COMPLETED,
|
"status": TaskStatus.COMPLETED,
|
||||||
"result": json.dumps(content)
|
"result": json.dumps(content)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on successful completion
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="completed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
result=result_data
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
|
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
@@ -187,6 +225,16 @@ async def process_llm_extraction(
|
|||||||
"error": str(e)
|
"error": str(e)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="failed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
async def handle_markdown_request(
|
async def handle_markdown_request(
|
||||||
url: str,
|
url: str,
|
||||||
filter_type: FilterType,
|
filter_type: FilterType,
|
||||||
@@ -275,6 +323,7 @@ async def handle_llm_request(
|
|||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None,
|
config: Optional[dict] = None,
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None,
|
||||||
|
webhook_config: Optional[Dict] = None,
|
||||||
temperature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
api_base_url: Optional[str] = None
|
api_base_url: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
@@ -308,6 +357,7 @@ async def handle_llm_request(
|
|||||||
base_url,
|
base_url,
|
||||||
config,
|
config,
|
||||||
provider,
|
provider,
|
||||||
|
webhook_config,
|
||||||
temperature,
|
temperature,
|
||||||
api_base_url
|
api_base_url
|
||||||
)
|
)
|
||||||
@@ -355,6 +405,7 @@ async def create_new_task(
|
|||||||
base_url: str,
|
base_url: str,
|
||||||
config: dict,
|
config: dict,
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None,
|
||||||
|
webhook_config: Optional[Dict] = None,
|
||||||
temperature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
api_base_url: Optional[str] = None
|
api_base_url: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
@@ -365,12 +416,18 @@ async def create_new_task(
|
|||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
|
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
|
||||||
|
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
task_data = {
|
||||||
"status": TaskStatus.PROCESSING,
|
"status": TaskStatus.PROCESSING,
|
||||||
"created_at": datetime.now().isoformat(),
|
"created_at": datetime.now().isoformat(),
|
||||||
"url": decoded_url
|
"url": decoded_url
|
||||||
})
|
}
|
||||||
|
|
||||||
|
# Store webhook config if provided
|
||||||
|
if webhook_config:
|
||||||
|
task_data["webhook_config"] = json.dumps(webhook_config)
|
||||||
|
|
||||||
|
await redis.hset(f"task:{task_id}", mapping=task_data)
|
||||||
|
|
||||||
background_tasks.add_task(
|
background_tasks.add_task(
|
||||||
process_llm_extraction,
|
process_llm_extraction,
|
||||||
@@ -382,6 +439,7 @@ async def create_new_task(
|
|||||||
schema,
|
schema,
|
||||||
cache,
|
cache,
|
||||||
provider,
|
provider,
|
||||||
|
webhook_config,
|
||||||
temperature,
|
temperature,
|
||||||
api_base_url
|
api_base_url
|
||||||
)
|
)
|
||||||
@@ -723,6 +781,7 @@ async def handle_crawl_job(
|
|||||||
browser_config: Dict,
|
browser_config: Dict,
|
||||||
crawler_config: Dict,
|
crawler_config: Dict,
|
||||||
config: Dict,
|
config: Dict,
|
||||||
|
webhook_config: Optional[Dict] = None,
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Fire-and-forget version of handle_crawl_request.
|
Fire-and-forget version of handle_crawl_request.
|
||||||
@@ -730,13 +789,24 @@ async def handle_crawl_job(
|
|||||||
lets /crawl/job/{task_id} polling fetch the result.
|
lets /crawl/job/{task_id} polling fetch the result.
|
||||||
"""
|
"""
|
||||||
task_id = f"crawl_{uuid4().hex[:8]}"
|
task_id = f"crawl_{uuid4().hex[:8]}"
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
|
||||||
|
# Store task data in Redis
|
||||||
|
task_data = {
|
||||||
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
||||||
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
|
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
|
||||||
"url": json.dumps(urls), # store list as JSON string
|
"url": json.dumps(urls), # store list as JSON string
|
||||||
"result": "",
|
"result": "",
|
||||||
"error": "",
|
"error": "",
|
||||||
})
|
}
|
||||||
|
|
||||||
|
# Store webhook config if provided
|
||||||
|
if webhook_config:
|
||||||
|
task_data["webhook_config"] = json.dumps(webhook_config)
|
||||||
|
|
||||||
|
await redis.hset(f"task:{task_id}", mapping=task_data)
|
||||||
|
|
||||||
|
# Initialize webhook service
|
||||||
|
webhook_service = WebhookDeliveryService(config)
|
||||||
|
|
||||||
async def _runner():
|
async def _runner():
|
||||||
try:
|
try:
|
||||||
@@ -750,6 +820,17 @@ async def handle_crawl_job(
|
|||||||
"status": TaskStatus.COMPLETED,
|
"status": TaskStatus.COMPLETED,
|
||||||
"result": json.dumps(result),
|
"result": json.dumps(result),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on successful completion
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="crawl",
|
||||||
|
status="completed",
|
||||||
|
urls=urls,
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
result=result
|
||||||
|
)
|
||||||
|
|
||||||
await asyncio.sleep(5) # Give Redis time to process the update
|
await asyncio.sleep(5) # Give Redis time to process the update
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
@@ -757,5 +838,15 @@ async def handle_crawl_job(
|
|||||||
"error": str(exc),
|
"error": str(exc),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="crawl",
|
||||||
|
status="failed",
|
||||||
|
urls=urls,
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=str(exc)
|
||||||
|
)
|
||||||
|
|
||||||
background_tasks.add_task(_runner)
|
background_tasks.add_task(_runner)
|
||||||
return {"task_id": task_id}
|
return {"task_id": task_id}
|
||||||
@@ -87,4 +87,17 @@ observability:
|
|||||||
enabled: True
|
enabled: True
|
||||||
endpoint: "/metrics"
|
endpoint: "/metrics"
|
||||||
health_check:
|
health_check:
|
||||||
endpoint: "/health"
|
endpoint: "/health"
|
||||||
|
|
||||||
|
# Webhook Configuration
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: null # Optional: default webhook URL for all jobs
|
||||||
|
data_in_payload: false # Optional: default behavior for including data
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000 # 30s timeout per webhook call
|
||||||
|
headers: # Optional: default headers to include
|
||||||
|
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||||
@@ -12,6 +12,7 @@ from api import (
|
|||||||
handle_crawl_job,
|
handle_crawl_job,
|
||||||
handle_task_status,
|
handle_task_status,
|
||||||
)
|
)
|
||||||
|
from schemas import WebhookConfig
|
||||||
|
|
||||||
# ------------- dependency placeholders -------------
|
# ------------- dependency placeholders -------------
|
||||||
_redis = None # will be injected from server.py
|
_redis = None # will be injected from server.py
|
||||||
@@ -37,6 +38,7 @@ class LlmJobPayload(BaseModel):
|
|||||||
schema: Optional[str] = None
|
schema: Optional[str] = None
|
||||||
cache: bool = False
|
cache: bool = False
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None
|
||||||
|
webhook_config: Optional[WebhookConfig] = None
|
||||||
temperature: Optional[float] = None
|
temperature: Optional[float] = None
|
||||||
base_url: Optional[str] = None
|
base_url: Optional[str] = None
|
||||||
|
|
||||||
@@ -45,6 +47,7 @@ class CrawlJobPayload(BaseModel):
|
|||||||
urls: list[HttpUrl]
|
urls: list[HttpUrl]
|
||||||
browser_config: Dict = {}
|
browser_config: Dict = {}
|
||||||
crawler_config: Dict = {}
|
crawler_config: Dict = {}
|
||||||
|
webhook_config: Optional[WebhookConfig] = None
|
||||||
|
|
||||||
|
|
||||||
# ---------- LLM job ---------------------------------------------------------
|
# ---------- LLM job ---------------------------------------------------------
|
||||||
@@ -55,6 +58,10 @@ async def llm_job_enqueue(
|
|||||||
request: Request,
|
request: Request,
|
||||||
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
|
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
|
||||||
):
|
):
|
||||||
|
webhook_config = None
|
||||||
|
if payload.webhook_config:
|
||||||
|
webhook_config = payload.webhook_config.model_dump(mode='json')
|
||||||
|
|
||||||
return await handle_llm_request(
|
return await handle_llm_request(
|
||||||
_redis,
|
_redis,
|
||||||
background_tasks,
|
background_tasks,
|
||||||
@@ -65,6 +72,7 @@ async def llm_job_enqueue(
|
|||||||
cache=payload.cache,
|
cache=payload.cache,
|
||||||
config=_config,
|
config=_config,
|
||||||
provider=payload.provider,
|
provider=payload.provider,
|
||||||
|
webhook_config=webhook_config,
|
||||||
temperature=payload.temperature,
|
temperature=payload.temperature,
|
||||||
api_base_url=payload.base_url,
|
api_base_url=payload.base_url,
|
||||||
)
|
)
|
||||||
@@ -86,6 +94,10 @@ async def crawl_job_enqueue(
|
|||||||
background_tasks: BackgroundTasks,
|
background_tasks: BackgroundTasks,
|
||||||
_td: Dict = Depends(lambda: _token_dep()),
|
_td: Dict = Depends(lambda: _token_dep()),
|
||||||
):
|
):
|
||||||
|
webhook_config = None
|
||||||
|
if payload.webhook_config:
|
||||||
|
webhook_config = payload.webhook_config.model_dump(mode='json')
|
||||||
|
|
||||||
return await handle_crawl_job(
|
return await handle_crawl_job(
|
||||||
_redis,
|
_redis,
|
||||||
background_tasks,
|
background_tasks,
|
||||||
@@ -93,6 +105,7 @@ async def crawl_job_enqueue(
|
|||||||
payload.browser_config,
|
payload.browser_config,
|
||||||
payload.crawler_config,
|
payload.crawler_config,
|
||||||
config=_config,
|
config=_config,
|
||||||
|
webhook_config=webhook_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,6 @@ pydantic>=2.11
|
|||||||
rank-bm25==0.2.2
|
rank-bm25==0.2.2
|
||||||
anyio==4.9.0
|
anyio==4.9.0
|
||||||
PyJWT==2.10.1
|
PyJWT==2.10.1
|
||||||
mcp>=1.6.0
|
mcp>=1.18.0
|
||||||
websockets>=15.0.1
|
websockets>=15.0.1
|
||||||
httpx[http2]>=0.27.2
|
httpx[http2]>=0.27.2
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from typing import List, Optional, Dict
|
from typing import List, Optional, Dict
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field, HttpUrl
|
||||||
from utils import FilterType
|
from utils import FilterType
|
||||||
|
|
||||||
|
|
||||||
@@ -85,4 +85,22 @@ class JSEndpointRequest(BaseModel):
|
|||||||
scripts: List[str] = Field(
|
scripts: List[str] = Field(
|
||||||
...,
|
...,
|
||||||
description="List of separated JavaScript snippets to execute"
|
description="List of separated JavaScript snippets to execute"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class WebhookConfig(BaseModel):
|
||||||
|
"""Configuration for webhook notifications."""
|
||||||
|
webhook_url: HttpUrl
|
||||||
|
webhook_data_in_payload: bool = False
|
||||||
|
webhook_headers: Optional[Dict[str, str]] = None
|
||||||
|
|
||||||
|
|
||||||
|
class WebhookPayload(BaseModel):
|
||||||
|
"""Payload sent to webhook endpoints."""
|
||||||
|
task_id: str
|
||||||
|
task_type: str # "crawl", "llm_extraction", etc.
|
||||||
|
status: str # "completed" or "failed"
|
||||||
|
timestamp: str # ISO 8601 format
|
||||||
|
urls: List[str]
|
||||||
|
error: Optional[str] = None
|
||||||
|
data: Optional[Dict] = None # Included only if webhook_data_in_payload=True
|
||||||
159
deploy/docker/webhook.py
Normal file
159
deploy/docker/webhook.py
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
"""
|
||||||
|
Webhook delivery service for Crawl4AI.
|
||||||
|
|
||||||
|
This module provides webhook notification functionality with exponential backoff retry logic.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import httpx
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Optional
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class WebhookDeliveryService:
    """Handles webhook delivery with exponential backoff retry logic."""

    def __init__(self, config: Dict):
        """
        Initialize the webhook delivery service.

        Args:
            config: Application configuration dictionary containing webhook settings
                under the "webhooks" key (retry policy, default URL, headers).
        """
        # A YAML section left empty ("webhooks:") deserializes to None, not {}.
        # `get(..., {})` would return that None and crash below, so normalize
        # falsy values to an empty dict explicitly.
        self.config = config.get("webhooks") or {}
        retry_cfg = self.config.get("retry") or {}
        self.max_attempts = retry_cfg.get("max_attempts", 5)
        # Retry/timeout knobs are configured in milliseconds; convert once to
        # seconds for asyncio.sleep and httpx.
        self.initial_delay = retry_cfg.get("initial_delay_ms", 1000) / 1000
        self.max_delay = retry_cfg.get("max_delay_ms", 32000) / 1000
        self.timeout = retry_cfg.get("timeout_ms", 30000) / 1000

    async def send_webhook(
        self,
        webhook_url: str,
        payload: Dict,
        headers: Optional[Dict[str, str]] = None
    ) -> bool:
        """
        Send webhook with exponential backoff retry logic.

        Retries only on server errors (5xx) and transport failures; 2xx is
        success and any other non-5xx status is treated as a permanent
        client error.

        Args:
            webhook_url: The URL to send the webhook to
            payload: The JSON payload to send
            headers: Optional custom headers (merged over configured defaults)

        Returns:
            bool: True if delivered successfully, False otherwise
        """
        default_headers = self.config.get("headers") or {}
        merged_headers = {**default_headers, **(headers or {})}
        # The body is always JSON; enforce the content type regardless of
        # caller-supplied headers.
        merged_headers["Content-Type"] = "application/json"

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            for attempt in range(self.max_attempts):
                try:
                    logger.info(
                        f"Sending webhook (attempt {attempt + 1}/{self.max_attempts}) to {webhook_url}"
                    )

                    response = await client.post(
                        webhook_url,
                        json=payload,
                        headers=merged_headers
                    )

                    # Success or client error (don't retry client errors)
                    if response.status_code < 500:
                        if 200 <= response.status_code < 300:
                            logger.info(f"Webhook delivered successfully to {webhook_url}")
                            return True
                        logger.warning(
                            f"Webhook rejected with status {response.status_code}: {response.text[:200]}"
                        )
                        return False  # Client error - don't retry

                    # Server error - retry with backoff
                    logger.warning(
                        f"Webhook failed with status {response.status_code}, will retry"
                    )

                except httpx.TimeoutException as exc:
                    logger.error(f"Webhook timeout (attempt {attempt + 1}): {exc}")
                except httpx.RequestError as exc:
                    logger.error(f"Webhook request error (attempt {attempt + 1}): {exc}")
                except Exception as exc:
                    logger.error(f"Webhook delivery error (attempt {attempt + 1}): {exc}")

                # Exponential backoff: 1s, 2s, 4s, ... capped at max_delay.
                # Skip the sleep after the final attempt.
                if attempt < self.max_attempts - 1:
                    delay = min(self.initial_delay * (2 ** attempt), self.max_delay)
                    logger.info(f"Retrying in {delay}s...")
                    await asyncio.sleep(delay)

        logger.error(
            f"Webhook delivery failed after {self.max_attempts} attempts to {webhook_url}"
        )
        return False

    async def notify_job_completion(
        self,
        task_id: str,
        task_type: str,
        status: str,
        urls: list,
        webhook_config: Optional[Dict],
        result: Optional[Dict] = None,
        error: Optional[str] = None
    ):
        """
        Notify webhook of job completion.

        Per-job `webhook_config` takes precedence over the globally configured
        default URL; if neither yields a URL, or webhooks are globally
        disabled, this is a silent no-op.

        Args:
            task_id: The task identifier
            task_type: Type of task (e.g., "crawl", "llm_extraction")
            status: Task status ("completed" or "failed")
            urls: List of URLs that were crawled
            webhook_config: Webhook configuration from the job request
            result: Optional crawl result data
            error: Optional error message if failed
        """
        # Honor the global kill-switch before doing any other work.
        if not self.config.get("enabled", True):
            logger.debug("Webhooks are disabled, skipping notification")
            return

        # Determine webhook URL and per-job overrides
        webhook_url = None
        data_in_payload = self.config.get("data_in_payload", False)
        custom_headers = None

        if webhook_config:
            webhook_url = webhook_config.get("webhook_url")
            data_in_payload = webhook_config.get("webhook_data_in_payload", data_in_payload)
            custom_headers = webhook_config.get("webhook_headers")

        if not webhook_url:
            webhook_url = self.config.get("default_url")

        if not webhook_url:
            logger.debug("No webhook URL configured, skipping notification")
            return

        # Build payload
        payload = {
            "task_id": task_id,
            "task_type": task_type,
            "status": status,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "urls": urls
        }

        if error:
            payload["error"] = error

        if data_in_payload and result:
            payload["data"] = result

        # Send webhook (fire and forget - don't block on completion)
        await self.send_webhook(webhook_url, payload, custom_headers)
||||||
@@ -10,7 +10,6 @@ Today I'm releasing Crawl4AI v0.7.4—the Intelligent Table Extraction & Perform
|
|||||||
|
|
||||||
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
||||||
- **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations
|
- **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations
|
||||||
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
|
||||||
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
||||||
- **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms
|
- **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms
|
||||||
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
||||||
@@ -158,40 +157,6 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
- **Monitoring Systems**: Faster health checks and status page monitoring
|
- **Monitoring Systems**: Faster health checks and status page monitoring
|
||||||
- **Data Aggregation**: Improved performance for real-time data collection
|
- **Data Aggregation**: Improved performance for real-time data collection
|
||||||
|
|
||||||
## 🧹 Memory Management Refactor: Cleaner Architecture
|
|
||||||
|
|
||||||
**The Problem:** Memory utilities were scattered and difficult to maintain, with potential import conflicts and unclear organization.
|
|
||||||
|
|
||||||
**My Solution:** I consolidated all memory-related utilities into the main `utils.py` module, creating a cleaner, more maintainable architecture.
|
|
||||||
|
|
||||||
### Improved Memory Handling
|
|
||||||
|
|
||||||
```python
|
|
||||||
# All memory utilities now consolidated
|
|
||||||
from crawl4ai.utils import get_true_memory_usage_percent, MemoryMonitor
|
|
||||||
|
|
||||||
# Enhanced memory monitoring
|
|
||||||
monitor = MemoryMonitor()
|
|
||||||
monitor.start_monitoring()
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
# Memory-efficient batch processing
|
|
||||||
results = await crawler.arun_many(large_url_list)
|
|
||||||
|
|
||||||
# Get accurate memory metrics
|
|
||||||
memory_usage = get_true_memory_usage_percent()
|
|
||||||
memory_report = monitor.get_report()
|
|
||||||
|
|
||||||
print(f"Memory efficiency: {memory_report['efficiency']:.1f}%")
|
|
||||||
print(f"Peak usage: {memory_report['peak_mb']:.1f} MB")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
|
||||||
- **Production Stability**: More reliable memory tracking and management
|
|
||||||
- **Code Maintainability**: Cleaner architecture for easier debugging
|
|
||||||
- **Import Clarity**: Resolved potential conflicts and import issues
|
|
||||||
- **Developer Experience**: Simpler API for memory monitoring
|
|
||||||
|
|
||||||
## 🔧 Critical Stability Fixes
|
## 🔧 Critical Stability Fixes
|
||||||
|
|
||||||
### Browser Manager Race Condition Resolution
|
### Browser Manager Race Condition Resolution
|
||||||
|
|||||||
318
docs/blog/release-v0.7.5.md
Normal file
318
docs/blog/release-v0.7.5.md
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
||||||
|
|
||||||
|
*September 29, 2025 • 8 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
||||||
|
|
||||||
|
## 🎯 What's New at a Glance
|
||||||
|
|
||||||
|
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
||||||
|
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
||||||
|
- **Enhanced LLM Integration**: Custom providers with temperature control
|
||||||
|
- **HTTPS Preservation**: Secure internal link handling
|
||||||
|
- **Bug Fixes**: Resolved multiple community-reported issues
|
||||||
|
- **Improved Docker Error Handling**: Better debugging and reliability
|
||||||
|
|
||||||
|
## 🔧 Docker Hooks System: Pipeline Customization
|
||||||
|
|
||||||
|
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
||||||
|
|
||||||
|
### Real Example: Authentication & Performance
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Real working hooks for httpbin.org
|
||||||
|
hooks_config = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Setting up page context")
|
||||||
|
# Block images to speed up crawling
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
print("Hook: Images blocked")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_retrieve_html": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Before retrieving HTML")
|
||||||
|
# Scroll to bottom to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
print("Hook: Scrolled to bottom")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_goto": """
|
||||||
|
async def hook(page, context, url, **kwargs):
|
||||||
|
print(f"Hook: About to navigate to {url}")
|
||||||
|
# Add custom headers
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Test-Header': 'crawl4ai-hooks-test'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test with Docker API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {
|
||||||
|
"code": hooks_config,
|
||||||
|
"timeout": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
result = response.json()
|
||||||
|
|
||||||
|
if result.get('success'):
|
||||||
|
print("✅ Hooks executed successfully!")
|
||||||
|
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Available Hook Points:**
|
||||||
|
- `on_browser_created`: Browser setup
|
||||||
|
- `on_page_context_created`: Page context configuration
|
||||||
|
- `before_goto`: Pre-navigation setup
|
||||||
|
- `after_goto`: Post-navigation processing
|
||||||
|
- `on_user_agent_updated`: User agent changes
|
||||||
|
- `on_execution_started`: Crawl initialization
|
||||||
|
- `before_retrieve_html`: Pre-extraction processing
|
||||||
|
- `before_return_html`: Final HTML processing
|
||||||
|
|
||||||
|
### Function-Based Hooks API
|
||||||
|
|
||||||
|
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
||||||
|
|
||||||
|
**Option 1: Using the `hooks_to_string()` Utility**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions (with full IDE support!)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers"""
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Crawl4AI': 'v0.7.5',
|
||||||
|
'X-Custom-Header': 'my-value'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Convert functions to strings
|
||||||
|
hooks_code = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
})
|
||||||
|
|
||||||
|
# Use with REST API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {"code": hooks_code, "timeout": 30}
|
||||||
|
}
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Define hooks as functions (same as above)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_retrieve_html(page, context, **kwargs):
|
||||||
|
# Scroll to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use Docker client - conversion happens automatically!
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_retrieve_html": before_retrieve_html
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if results and results.success:
|
||||||
|
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits of Function-Based Hooks:**
|
||||||
|
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
||||||
|
- ✅ Type checking and linting
|
||||||
|
- ✅ Easier to test and debug
|
||||||
|
- ✅ Reusable across projects
|
||||||
|
- ✅ Automatic conversion in Docker client
|
||||||
|
- ✅ No breaking changes - string hooks still work!
|
||||||
|
|
||||||
|
## 🤖 Enhanced LLM Integration
|
||||||
|
|
||||||
|
Enhanced LLM integration with custom providers, temperature control, and base URL configuration.
|
||||||
|
|
||||||
|
### Multi-Provider Support
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
|
# Test with different providers
|
||||||
|
async def test_llm_providers():
|
||||||
|
# OpenAI with custom temperature
|
||||||
|
openai_strategy = LLMExtractionStrategy(
|
||||||
|
provider="gemini/gemini-2.5-flash-lite",
|
||||||
|
api_token="your-api-token",
|
||||||
|
temperature=0.7, # New in v0.7.5
|
||||||
|
instruction="Summarize this page in one sentence"
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://example.com",
|
||||||
|
config=CrawlerRunConfig(extraction_strategy=openai_strategy)
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print("✅ LLM extraction completed")
|
||||||
|
print(result.extracted_content)
|
||||||
|
|
||||||
|
# Docker API with enhanced LLM config
|
||||||
|
llm_payload = {
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Summarize this page in one sentence.",
|
||||||
|
"provider": "gemini/gemini-2.5-flash-lite",
|
||||||
|
"temperature": 0.7
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/md", json=llm_payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**New Features:**
|
||||||
|
- Custom `temperature` parameter for creativity control
|
||||||
|
- `base_url` for custom API endpoints
|
||||||
|
- Multi-provider environment variable support
|
||||||
|
- Docker API integration
|
||||||
|
|
||||||
|
## 🔒 HTTPS Preservation
|
||||||
|
|
||||||
|
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
||||||
|
|
||||||
|
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
||||||
|
|
||||||
|
async def test_https_preservation():
|
||||||
|
# Enable HTTPS preservation
|
||||||
|
url_filter = URLPatternFilter(
|
||||||
|
patterns=[r"^(https://)?quotes\.toscrape\.com(/.*)?$"]
|
||||||
|
)
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
exclude_external_links=True,
|
||||||
|
preserve_https_for_internal_links=True, # New in v0.7.5
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||||
|
max_depth=2,
|
||||||
|
max_pages=5,
|
||||||
|
filter_chain=FilterChain([url_filter])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
async for result in await crawler.arun(
|
||||||
|
url="https://quotes.toscrape.com",
|
||||||
|
config=config
|
||||||
|
):
|
||||||
|
# All internal links maintain HTTPS
|
||||||
|
internal_links = [link['href'] for link in result.links['internal']]
|
||||||
|
https_links = [link for link in internal_links if link.startswith('https://')]
|
||||||
|
|
||||||
|
print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
|
||||||
|
for link in https_links[:3]:
|
||||||
|
print(f" → {link}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🛠️ Bug Fixes and Improvements
|
||||||
|
|
||||||
|
### Major Fixes
|
||||||
|
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
||||||
|
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
||||||
|
- **Docker Error Handling**: Comprehensive error messages with status codes
|
||||||
|
- **Memory Management**: Fixed leaks in long-running sessions
|
||||||
|
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
||||||
|
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
||||||
|
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
||||||
|
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
||||||
|
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
||||||
|
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
||||||
|
|
||||||
|
### Community-Reported Issues Fixed
|
||||||
|
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
||||||
|
- Fixed browser configuration reference errors
|
||||||
|
- Resolved dependency conflicts with cssselect
|
||||||
|
- Improved error messaging for failed authentications
|
||||||
|
- Enhanced compatibility with various proxy configurations
|
||||||
|
- Fixed edge cases in URL normalization
|
||||||
|
|
||||||
|
### Configuration Updates
|
||||||
|
```python
|
||||||
|
# Old proxy config (deprecated)
|
||||||
|
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
||||||
|
|
||||||
|
# New enhanced proxy config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
proxy_config={
|
||||||
|
"server": "http://proxy:8080",
|
||||||
|
"username": "optional-user",
|
||||||
|
"password": "optional-pass"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔄 Breaking Changes
|
||||||
|
|
||||||
|
1. **Python 3.10+ Required**: Upgrade from Python 3.9
|
||||||
|
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
||||||
|
3. **New Dependency**: Added `cssselect` for better CSS handling
|
||||||
|
|
||||||
|
## 🚀 Get Started
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install latest version
|
||||||
|
pip install crawl4ai==0.7.5
|
||||||
|
|
||||||
|
# Docker deployment
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
**Try the Demo:**
|
||||||
|
```bash
|
||||||
|
# Run working examples
|
||||||
|
python docs/releases_review/demo_v0.7.5.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Resources:**
|
||||||
|
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||||
|
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
|
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
||||||
|
|
||||||
|
Happy crawling! 🕷️
|
||||||
314
docs/blog/release-v0.7.6.md
Normal file
314
docs/blog/release-v0.7.6.md
Normal file
@@ -0,0 +1,314 @@
|
|||||||
|
# Crawl4AI v0.7.6 Release Notes
|
||||||
|
|
||||||
|
*Release Date: October 22, 2025*
|
||||||
|
|
||||||
|
I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
|
||||||
|
|
||||||
|
## 🎯 What's New
|
||||||
|
|
||||||
|
### Webhook Support for Docker Job Queue API
|
||||||
|
|
||||||
|
The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done - get instant notifications when they complete!
|
||||||
|
|
||||||
|
**Key Capabilities:**
|
||||||
|
|
||||||
|
- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
|
||||||
|
- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
|
||||||
|
- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||||
|
- ✅ **Custom Authentication**: Add custom headers for webhook authentication
|
||||||
|
- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
|
||||||
|
- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
Instead of constantly checking job status:
|
||||||
|
|
||||||
|
**OLD WAY (Polling):**
|
||||||
|
```python
|
||||||
|
# Submit job
|
||||||
|
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
|
||||||
|
# Poll until complete
|
||||||
|
while True:
|
||||||
|
status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
||||||
|
if status.json()['status'] == 'completed':
|
||||||
|
break
|
||||||
|
time.sleep(5) # Wait and try again
|
||||||
|
```
|
||||||
|
|
||||||
|
**NEW WAY (Webhooks):**
|
||||||
|
```python
|
||||||
|
# Submit job with webhook
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||||
|
|
||||||
|
# Done! Webhook will notify you when complete
|
||||||
|
# Your webhook handler receives the results automatically
|
||||||
|
```
|
||||||
|
|
||||||
|
### Crawl Job Webhooks
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {"headless": true},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### LLM Extraction Job Webhooks (NEW!)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/llm/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"q": "Extract the article title, author, and publication date",
|
||||||
|
"schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Webhook Payload Structure
|
||||||
|
|
||||||
|
**Success (with data):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432",
|
||||||
|
"task_type": "llm_extraction",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com/article"],
|
||||||
|
"data": {
|
||||||
|
"extracted_content": {
|
||||||
|
"title": "Understanding Web Scraping",
|
||||||
|
"author": "John Doe",
|
||||||
|
"date": "2025-10-22"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Failure:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_abc123",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "failed",
|
||||||
|
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"error": "Connection timeout after 30s"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Simple Webhook Handler Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
from flask import Flask, request, jsonify
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@app.route('/webhook', methods=['POST'])
|
||||||
|
def handle_webhook():
|
||||||
|
payload = request.json
|
||||||
|
|
||||||
|
task_id = payload['task_id']
|
||||||
|
task_type = payload['task_type']
|
||||||
|
status = payload['status']
|
||||||
|
|
||||||
|
if status == 'completed':
|
||||||
|
if 'data' in payload:
|
||||||
|
# Process data directly
|
||||||
|
data = payload['data']
|
||||||
|
else:
|
||||||
|
# Fetch from API
|
||||||
|
endpoint = 'crawl' if task_type == 'crawl' else 'llm'
|
||||||
|
response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
# Your business logic here
|
||||||
|
print(f"Job {task_id} completed!")
|
||||||
|
|
||||||
|
elif status == 'failed':
|
||||||
|
error = payload.get('error', 'Unknown error')
|
||||||
|
print(f"Job {task_id} failed: {error}")
|
||||||
|
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
app.run(port=8080)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📊 Performance Improvements
|
||||||
|
|
||||||
|
- **Reduced Server Load**: Eliminates constant polling requests
|
||||||
|
- **Lower Latency**: Instant notification vs. polling interval delay
|
||||||
|
- **Better Resource Usage**: Frees up client connections while jobs run in background
|
||||||
|
- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
|
||||||
|
|
||||||
|
## 🐛 Bug Fixes
|
||||||
|
|
||||||
|
- Fixed webhook configuration serialization for Pydantic HttpUrl fields
|
||||||
|
- Improved error handling in webhook delivery service
|
||||||
|
- Enhanced Redis task storage for webhook config persistence
|
||||||
|
|
||||||
|
## 🌍 Expected Real-World Impact
|
||||||
|
|
||||||
|
### For Web Scraping Workflows
|
||||||
|
- **Reduced Costs**: Less API calls = lower bandwidth and server costs
|
||||||
|
- **Better UX**: Instant notifications improve user experience
|
||||||
|
- **Scalability**: Handle 100s of concurrent jobs without polling overhead
|
||||||
|
|
||||||
|
### For LLM Extraction Pipelines
|
||||||
|
- **Async Processing**: Submit LLM extraction jobs and move on
|
||||||
|
- **Batch Processing**: Queue multiple extractions, get notified as they complete
|
||||||
|
- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
|
||||||
|
|
||||||
|
### For Microservices
|
||||||
|
- **Event-Driven**: Perfect for event-driven microservice architectures
|
||||||
|
- **Decoupling**: Decouple job submission from result processing
|
||||||
|
- **Reliability**: Automatic retries ensure webhooks are delivered
|
||||||
|
|
||||||
|
## 🔄 Breaking Changes
|
||||||
|
|
||||||
|
**None!** This release is fully backward compatible.
|
||||||
|
|
||||||
|
- Webhook configuration is optional
|
||||||
|
- Existing code continues to work without modification
|
||||||
|
- Polling is still supported for jobs without webhook config
|
||||||
|
|
||||||
|
## 📚 Documentation
|
||||||
|
|
||||||
|
### New Documentation
|
||||||
|
- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
|
||||||
|
- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
|
||||||
|
|
||||||
|
### Updated Documentation
|
||||||
|
- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
|
||||||
|
- API documentation with webhook examples
|
||||||
|
|
||||||
|
## 🛠️ Migration Guide
|
||||||
|
|
||||||
|
No migration needed! Webhooks are opt-in:
|
||||||
|
|
||||||
|
1. **To use webhooks**: Add `webhook_config` to your job payload
|
||||||
|
2. **To keep polling**: Continue using your existing code
|
||||||
|
|
||||||
|
### Quick Start
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Just add webhook_config to your existing payload
|
||||||
|
payload = {
|
||||||
|
# Your existing configuration
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {...},
|
||||||
|
"crawler_config": {...},
|
||||||
|
|
||||||
|
# NEW: Add webhook configuration
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔧 Configuration
|
||||||
|
|
||||||
|
### Global Webhook Configuration (config.yml)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: "https://myapp.com/webhooks/default" # Optional
|
||||||
|
data_in_payload: false
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000
|
||||||
|
headers:
|
||||||
|
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🚀 Upgrade Instructions
|
||||||
|
|
||||||
|
### Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pull the latest image
|
||||||
|
docker pull unclecode/crawl4ai:0.7.6
|
||||||
|
|
||||||
|
# Or use latest tag
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
|
||||||
|
# Run with webhook support
|
||||||
|
docker run -d \
|
||||||
|
-p 11235:11235 \
|
||||||
|
--env-file .llm.env \
|
||||||
|
--name crawl4ai \
|
||||||
|
unclecode/crawl4ai:0.7.6
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python Package
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install --upgrade crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
## 💡 Pro Tips
|
||||||
|
|
||||||
|
1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads
|
||||||
|
2. **Set custom headers** for webhook authentication and request tracking
|
||||||
|
3. **Configure global default webhook** for consistent handling across all jobs
|
||||||
|
4. **Implement idempotent webhook handlers** - same webhook may be delivered multiple times on retry
|
||||||
|
5. **Use structured schemas** with LLM extraction for predictable webhook data
|
||||||
|
|
||||||
|
## 🎬 Demo
|
||||||
|
|
||||||
|
Try the release demo:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python docs/releases_review/demo_v0.7.6.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This comprehensive demo showcases:
|
||||||
|
- Crawl job webhooks (notification-only and with data)
|
||||||
|
- LLM extraction webhooks (with JSON schema support)
|
||||||
|
- Custom headers for authentication
|
||||||
|
- Webhook retry mechanism
|
||||||
|
- Real-time webhook receiver
|
||||||
|
|
||||||
|
## 🙏 Acknowledgments
|
||||||
|
|
||||||
|
Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
|
||||||
|
|
||||||
|
## 📞 Support
|
||||||
|
|
||||||
|
- **Documentation**: https://docs.crawl4ai.com
|
||||||
|
- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
|
||||||
|
- **Discord**: https://discord.gg/crawl4ai
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Happy crawling with webhooks!** 🕷️🪝
|
||||||
|
|
||||||
|
*- unclecode*
|
||||||
522
docs/examples/docker_client_hooks_example.py
Normal file
522
docs/examples/docker_client_hooks_example.py
Normal file
@@ -0,0 +1,522 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Comprehensive hooks examples using Docker Client with function objects.
|
||||||
|
|
||||||
|
This approach is recommended because:
|
||||||
|
- Write hooks as regular Python functions
|
||||||
|
- Full IDE support (autocomplete, type checking)
|
||||||
|
- Automatic conversion to API format
|
||||||
|
- Reusable and testable code
|
||||||
|
- Clean, readable syntax
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# API_BASE_URL = "http://localhost:11235"
|
||||||
|
API_BASE_URL = "http://localhost:11234"
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Hook Function Definitions
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# --- All Hooks Demo ---
|
||||||
|
async def browser_created_hook(browser, **kwargs):
    """Hook invoked once the browser instance has been created.

    Logs readiness and hands the browser object straight back to the crawler.
    """
    message = "[HOOK] Browser created and ready"
    print(message)
    return browser
|
||||||
|
|
||||||
|
|
||||||
|
async def page_context_hook(page, context, **kwargs):
    """Prepare a freshly created page/context before any navigation.

    Sets the viewport, seeds a session cookie, and blocks image and
    analytics requests to keep crawls lightweight.
    """
    print("[HOOK] Setting up page environment")

    # Fixed desktop viewport for deterministic layouts.
    viewport = {"width": 1920, "height": 1080}
    await page.set_viewport_size(viewport)

    session_cookie = {
        "name": "test_session",
        "value": "abc123xyz",
        "domain": ".httpbin.org",
        "path": "/",
    }
    await context.add_cookies([session_cookie])

    # Abort requests that are never needed for text extraction.
    blocked_patterns = ["**/*.{png,jpg,jpeg,gif}", "**/analytics/*"]
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    print("[HOOK] Environment configured")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def user_agent_hook(page, context, user_agent, **kwargs):
    """Hook fired whenever the crawler changes the user-agent string."""
    preview = user_agent[:50]
    print(f"[HOOK] User agent: {preview}...")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_goto_hook(page, context, url, **kwargs):
    """Attach custom request headers just before navigation begins."""
    print(f"[HOOK] Navigating to: {url}")

    extra_headers = {
        "X-Custom-Header": "crawl4ai-test",
        "Accept-Language": "en-US",
    }
    await page.set_extra_http_headers(extra_headers)

    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def after_goto_hook(page, context, url, response, **kwargs):
    """Hook fired after navigation completes.

    Gives the page a moment to settle, then waits briefly for the
    <body> element so downstream extraction sees rendered content.

    Args:
        page: Playwright page that just finished navigating.
        context: Browser context owning the page.
        url: The URL that was loaded (logged only).
        response: Navigation response object (unused here).

    Returns:
        The page object, after the settle delay.
    """
    print(f"[HOOK] Page loaded: {url}")

    await page.wait_for_timeout(1000)

    try:
        await page.wait_for_selector("body", timeout=2000)
        print("[HOOK] Body element ready")
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the best-effort wait without that risk.
        print("[HOOK] Timeout, continuing")

    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def execution_started_hook(page, context, **kwargs):
    """Hook fired when the crawler starts running custom JavaScript."""
    print("[HOOK] JS execution started")
    marker_script = "console.log('[HOOK] Custom JS');"
    await page.evaluate(marker_script)
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_retrieve_hook(page, context, **kwargs):
    """Scroll the page once so lazily loaded content renders before
    the HTML snapshot is taken."""
    print("[HOOK] Preparing HTML retrieval")

    # Bottom then back to top: triggers lazy loaders, restores position.
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    scroll_to_top = "window.scrollTo(0, 0);"
    await page.evaluate(scroll_to_bottom)
    await page.wait_for_timeout(500)
    await page.evaluate(scroll_to_top)

    print("[HOOK] Scrolling complete")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_return_hook(page, context, html, **kwargs):
    """Log the HTML size and a few DOM statistics right before the
    captured HTML is handed back to the caller."""
    size = len(html)
    print(f"[HOOK] HTML ready: {size} chars")

    dom_stats = await page.evaluate('''() => ({
        images: document.images.length,
        links: document.links.length,
        scripts: document.scripts.length
    })''')

    print(f"[HOOK] Metrics - Images: {dom_stats['images']}, Links: {dom_stats['links']}")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Authentication Hooks ---
|
||||||
|
async def auth_context_hook(page, context, **kwargs):
    """Seed the context with an auth cookie and localStorage session data."""
    print("[HOOK] Setting up authentication")

    auth_cookie = {
        "name": "auth_token",
        "value": "fake_jwt_token",
        "domain": ".httpbin.org",
        "path": "/",
        "httpOnly": True,
    }
    await context.add_cookies([auth_cookie])

    # Persist fake session markers the way a logged-in SPA would.
    await page.evaluate('''
        localStorage.setItem('user_id', '12345');
        localStorage.setItem('auth_time', new Date().toISOString());
    ''')

    print("[HOOK] Auth context ready")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def auth_headers_hook(page, context, url, **kwargs):
    """Attach HTTP Basic auth plus an API-key header before navigation."""
    print(f"[HOOK] Adding auth headers for {url}")

    # Local import keeps the example self-contained, as in the original.
    import base64
    token = base64.b64encode(b"user:passwd").decode('ascii')

    auth_headers = {
        'Authorization': f'Basic {token}',
        'X-API-Key': 'test-key-123',
    }
    await page.set_extra_http_headers(auth_headers)

    return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Performance Optimization Hooks ---
|
||||||
|
async def performance_hook(page, context, **kwargs):
    """Speed up crawling by blocking heavy assets, trackers and animations."""
    print("[HOOK] Optimizing for performance")

    # Media, fonts, video and common tracker domains are all aborted.
    blocked = [
        "**/*.{png,jpg,jpeg,gif,webp,svg}",
        "**/*.{woff,woff2,ttf}",
        "**/*.{mp4,webm,ogg}",
        "**/googletagmanager.com/*",
        "**/google-analytics.com/*",
        "**/facebook.com/*",
    ]
    for pattern in blocked:
        await context.route(pattern, lambda r: r.abort())

    # Kill CSS animations/transitions so waits and snapshots are stable.
    await page.add_style_tag(content='''
        *, *::before, *::after {
            animation-duration: 0s !important;
            transition-duration: 0s !important;
        }
    ''')

    print("[HOOK] Optimizations applied")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup_hook(page, context, **kwargs):
    """Strip ads, overlays, scripts and styles before the HTML capture."""
    print("[HOOK] Cleaning page")

    purge_script = '''() => {
        const selectors = [
            '.ad', '.ads', '.advertisement',
            '.popup', '.modal', '.overlay',
            '.cookie-banner', '.newsletter'
        ];

        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => el.remove());
        });

        document.querySelectorAll('script, style').forEach(el => el.remove());
    }'''
    await page.evaluate(purge_script)

    print("[HOOK] Page cleaned")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Content Extraction Hooks ---
|
||||||
|
async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
    """Give dynamic pages time to render and trigger a "Load More" button.

    Args:
        page: Playwright page that just loaded.
        context: Owning browser context.
        url: Loaded URL (logged only).
        response: Navigation response (unused).

    Returns:
        The page, after any optional extra content has loaded.
    """
    print(f"[HOOK] Waiting for dynamic content on {url}")

    await page.wait_for_timeout(2000)

    # Best effort: click a "Load More" control if the page exposes one.
    try:
        load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
        if load_more:
            await load_more.click()
            await page.wait_for_timeout(1000)
            print("[HOOK] Clicked 'Load More'")
    except Exception:
        # Was a bare `except:` — this no longer swallows KeyboardInterrupt
        # or SystemExit while still keeping the click best-effort.
        pass

    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_metadata_hook(page, context, **kwargs):
    """Read common <meta> tags, then scroll to pull in lazy content."""
    print("[HOOK] Extracting metadata")

    metadata = await page.evaluate('''() => {
        const getMeta = (name) => {
            const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return el ? el.getAttribute('content') : null;
        };

        return {
            title: document.title,
            description: getMeta('description'),
            author: getMeta('author'),
            keywords: getMeta('keywords'),
        };
    }''')

    print(f"[HOOK] Metadata: {metadata}")

    # Infinite scroll: three passes to the bottom of the document.
    for step in range(1, 4):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)
        print(f"[HOOK] Scroll {step}/3")

    return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Multi-URL Hooks ---
|
||||||
|
async def url_specific_hook(page, context, url, **kwargs):
    """Tag the outgoing request with a content-type hint based on the URL."""
    print(f"[HOOK] Processing URL: {url}")

    # Same substring routing as before: 'html' wins over 'json'.
    if 'html' in url:
        await page.set_extra_http_headers({"X-Type": "HTML"})
    elif 'json' in url:
        await page.set_extra_http_headers({"X-Type": "JSON"})

    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def track_progress_hook(page, context, url, response, **kwargs):
    """Report each URL's HTTP status as a multi-URL crawl progresses."""
    if response:
        status = response.status
    else:
        status = 'unknown'
    print(f"[HOOK] Loaded {url} - Status: {status}")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Test Functions
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def test_all_hooks_comprehensive():
    """Exercise every supported hook type in a single crawl."""
    print("=" * 70)
    print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling with all 8 hooks...")

        # Map every hook point to its function-object handler.
        hook_map = {
            "on_browser_created": browser_created_hook,
            "on_page_context_created": page_context_hook,
            "on_user_agent_updated": user_agent_hook,
            "before_goto": before_goto_hook,
            "after_goto": after_goto_hook,
            "on_execution_started": execution_started_hook,
            "before_retrieve_html": before_retrieve_hook,
            "before_return_html": before_return_hook,
        }

        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hook_map,
            hooks_timeout=30,
        )

        print("\n✅ Success!")
        print(f" URL: {result.url}")
        print(f" Success: {result.success}")
        print(f" HTML: {len(result.html)} chars")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_authentication_workflow():
    """Demonstrate HTTP Basic auth driven entirely by hooks."""
    print("\n" + "=" * 70)
    print("Test 2: Authentication Workflow (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting authentication...")

        auth_hooks = {
            "on_page_context_created": auth_context_hook,
            "before_goto": auth_headers_hook,
        }

        result = await client.crawl(
            ["https://httpbin.org/basic-auth/user/passwd"],
            hooks=auth_hooks,
            hooks_timeout=15,
        )

        print("\n✅ Authentication completed")

        # Flattened branching: failure first, then the success sub-cases.
        if not result.success:
            print(f" ❌ Failed: {result.error_message}")
        elif '"authenticated"' in result.html and 'true' in result.html:
            print(" ✅ Basic auth successful!")
        else:
            print(" ⚠️ Auth status unclear")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_performance_optimization():
    """Show how hooks trim page weight during a crawl."""
    print("\n" + "=" * 70)
    print("Test 3: Performance Optimization (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting performance hooks...")

        perf_hooks = {
            "on_page_context_created": performance_hook,
            "before_retrieve_html": cleanup_hook,
        }

        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=perf_hooks,
            hooks_timeout=10,
        )

        print("\n✅ Optimization completed")
        print(f" HTML size: {len(result.html):,} chars")
        print(" Resources blocked, ads removed")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_content_extraction():
    """Wait for dynamic content, then capture metadata while extracting."""
    print("\n" + "=" * 70)
    print("Test 4: Content Extraction (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting extraction hooks...")

        extraction_hooks = {
            "after_goto": wait_dynamic_content_hook,
            "before_retrieve_html": extract_metadata_hook,
        }

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=extraction_hooks,
            hooks_timeout=20,
        )

        print("\n✅ Extraction completed")
        print(f" URL: {result.url}")
        print(f" Success: {result.success}")
        print(f" Metadata: {result.metadata}")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_multi_url_crawl():
    """Run the same hook set across several URLs in one crawl call."""
    print("\n" + "=" * 70)
    print("Test 5: Multi-URL Crawl (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling multiple URLs...")

        shared_hooks = {
            "before_goto": url_specific_hook,
            "after_goto": track_progress_hook,
        }

        targets = [
            "https://httpbin.org/html",
            "https://httpbin.org/json",
            "https://httpbin.org/xml",
        ]
        results = await client.crawl(targets, hooks=shared_hooks, hooks_timeout=15)

        print("\n✅ Multi-URL crawl completed")
        print(f"\n Crawled {len(results)} URLs:")
        for index, crawled in enumerate(results, 1):
            marker = "✅" if crawled.success else "❌"
            print(f" {marker} {index}. {crawled.url}")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_reusable_hook_library():
    """Package common hooks as static methods on a reusable class."""
    print("\n" + "=" * 70)
    print("Test 6: Reusable Hook Library (Docker Client)")
    print("=" * 70)

    # A small library of ready-made, project-agnostic hooks.
    class HookLibrary:
        @staticmethod
        async def block_images(page, context, **kwargs):
            """Block all images"""
            await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort())
            print("[LIBRARY] Images blocked")
            return page

        @staticmethod
        async def block_analytics(page, context, **kwargs):
            """Block analytics"""
            await context.route("**/analytics/*", lambda r: r.abort())
            await context.route("**/google-analytics.com/*", lambda r: r.abort())
            print("[LIBRARY] Analytics blocked")
            return page

        @staticmethod
        async def scroll_infinite(page, context, **kwargs):
            """Handle infinite scroll"""
            # Stop early once the page height stops growing (max 5 passes).
            for _ in range(5):
                before = await page.evaluate("document.body.scrollHeight")
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
                await page.wait_for_timeout(1000)
                after = await page.evaluate("document.body.scrollHeight")
                if after == before:
                    break
            print("[LIBRARY] Infinite scroll complete")
            return page

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nUsing hook library...")

        library_hooks = {
            "on_page_context_created": HookLibrary.block_images,
            "before_retrieve_html": HookLibrary.scroll_infinite,
        }

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=library_hooks,
            hooks_timeout=20,
        )

        print("\n✅ Library hooks completed")
        print(f" Success: {result.success}")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Main
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def main():
    """Run all Docker client hook examples"""
    print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
    print("Using Python function objects with automatic conversion")
    print("=" * 70)

    demo_suite = [
        ("All Hooks Demo", test_all_hooks_comprehensive),
        ("Authentication", test_authentication_workflow),
        ("Performance", test_performance_optimization),
        ("Extraction", test_content_extraction),
        ("Multi-URL", test_multi_url_crawl),
        ("Hook Library", test_reusable_hook_library),
    ]

    total = len(demo_suite)
    for position, (label, runner) in enumerate(demo_suite, 1):
        try:
            await runner()
            print(f"\n✅ Test {position}/{total}: {label} completed\n")
        except Exception as exc:
            # One failing demo must not abort the rest of the suite.
            print(f"\n❌ Test {position}/{total}: {label} failed: {exc}\n")
            import traceback
            traceback.print_exc()

    print("=" * 70)
    print("🎉 All Docker client hook examples completed!")
    print("\n💡 Key Benefits of Function-Based Hooks:")
    print(" • Write as regular Python functions")
    print(" • Full IDE support (autocomplete, types)")
    print(" • Automatic conversion to API format")
    print(" • Reusable across projects")
    print(" • Clean, readable code")
    print(" • Easy to test and debug")
    print("=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
File diff suppressed because it is too large
Load Diff
461
docs/examples/docker_webhook_example.py
Normal file
461
docs/examples/docker_webhook_example.py
Normal file
@@ -0,0 +1,461 @@
|
|||||||
|
"""
|
||||||
|
Docker Webhook Example for Crawl4AI
|
||||||
|
|
||||||
|
This example demonstrates how to use webhooks with the Crawl4AI job queue API.
|
||||||
|
Instead of polling for results, webhooks notify your application when jobs complete.
|
||||||
|
|
||||||
|
Supports both:
|
||||||
|
- /crawl/job - Raw crawling with markdown extraction
|
||||||
|
- /llm/job - LLM-powered content extraction
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
1. Crawl4AI Docker container running on localhost:11235
|
||||||
|
2. Flask installed: pip install flask requests
|
||||||
|
3. LLM API key configured in .llm.env (for LLM extraction examples)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
1. Run this script: python docker_webhook_example.py
|
||||||
|
2. The webhook server will start on http://localhost:8080
|
||||||
|
3. Jobs will be submitted and webhooks will be received automatically
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CRAWL4AI_BASE_URL = "http://localhost:11235"
|
||||||
|
WEBHOOK_BASE_URL = "http://localhost:8080" # Your webhook receiver URL
|
||||||
|
|
||||||
|
# Initialize Flask app for webhook receiver
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
# Store received webhook data for demonstration
|
||||||
|
received_webhooks = []
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/webhooks/crawl-complete', methods=['POST'])
def handle_crawl_webhook():
    """
    Webhook handler that receives notifications when crawl jobs complete.

    Payload structure:
    {
        "task_id": "crawl_abc123",
        "task_type": "crawl",
        "status": "completed" or "failed",
        "timestamp": "2025-10-21T10:30:00.000000+00:00",
        "urls": ["https://example.com"],
        "error": "error message" (only if failed),
        "data": {...} (only if webhook_data_in_payload=True)
    }

    Returns:
        JSON acknowledgement with HTTP 200 so the sender stops retrying.
    """
    payload = request.json
    print(f"\n{'='*60}")
    print(f"📬 Webhook received for task: {payload['task_id']}")
    print(f" Status: {payload['status']}")
    print(f" Timestamp: {payload['timestamp']}")
    print(f" URLs: {payload['urls']}")

    if payload['status'] == 'completed':
        # If data is in payload, process it directly
        if 'data' in payload:
            print(f" ✅ Data included in webhook")
            data = payload['data']
            for result in data.get('results', []):
                print(f" - Crawled: {result.get('url')}")
                print(f" - Markdown length: {len(result.get('markdown', ''))}")
        else:
            # Fetch results from API if not included
            print(f" 📥 Fetching results from API...")
            task_id = payload['task_id']
            result_response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")
            if result_response.ok:
                data = result_response.json()
                print(f" ✅ Results fetched successfully")
                for result in data['result'].get('results', []):
                    print(f" - Crawled: {result.get('url')}")
                    print(f" - Markdown length: {len(result.get('markdown', ''))}")
            else:
                # Fix: a failed fetch was previously silent; surface it.
                print(f" ❌ Failed to fetch results: {result_response.text}")

    elif payload['status'] == 'failed':
        print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")

    print(f"{'='*60}\n")

    # Store webhook for demonstration
    received_webhooks.append(payload)

    # Return 200 OK to acknowledge receipt
    return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/webhooks/llm-complete', methods=['POST'])
def handle_llm_webhook():
    """
    Webhook handler that receives notifications when LLM extraction jobs complete.

    Payload structure:
    {
        "task_id": "llm_1698765432_12345",
        "task_type": "llm_extraction",
        "status": "completed" or "failed",
        "timestamp": "2025-10-21T10:30:00.000000+00:00",
        "urls": ["https://example.com/article"],
        "error": "error message" (only if failed),
        "data": {"extracted_content": {...}} (only if webhook_data_in_payload=True)
    }

    Returns:
        JSON acknowledgement with HTTP 200 so the sender stops retrying.
    """
    payload = request.json
    print(f"\n{'='*60}")
    print(f"🤖 LLM Webhook received for task: {payload['task_id']}")
    print(f" Task Type: {payload['task_type']}")
    print(f" Status: {payload['status']}")
    print(f" Timestamp: {payload['timestamp']}")
    print(f" URL: {payload['urls'][0]}")

    if payload['status'] == 'completed':
        # If data is in payload, process it directly
        if 'data' in payload:
            print(f" ✅ Data included in webhook")
            data = payload['data']
            # Webhook wraps extracted content in 'extracted_content' field
            extracted = data.get('extracted_content', {})
            print(f" - Extracted content:")
            print(f" {json.dumps(extracted, indent=8)}")
        else:
            # Fetch results from API if not included
            print(f" 📥 Fetching results from API...")
            task_id = payload['task_id']
            result_response = requests.get(f"{CRAWL4AI_BASE_URL}/llm/job/{task_id}")
            if result_response.ok:
                data = result_response.json()
                print(f" ✅ Results fetched successfully")
                # API returns unwrapped content in 'result' field
                extracted = data['result']
                print(f" - Extracted content:")
                print(f" {json.dumps(extracted, indent=8)}")
            else:
                # Fix: a failed fetch was previously silent; surface it.
                print(f" ❌ Failed to fetch results: {result_response.text}")

    elif payload['status'] == 'failed':
        print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")

    print(f"{'='*60}\n")

    # Store webhook for demonstration
    received_webhooks.append(payload)

    # Return 200 OK to acknowledge receipt
    return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
|
||||||
|
def start_webhook_server():
    """Start the Flask webhook server in a separate thread"""
    # NOTE(review): the port here must stay in sync with WEBHOOK_BASE_URL
    # above; use_reloader is disabled because this runs in a daemon thread.
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|
||||||
|
|
||||||
|
|
||||||
|
def submit_crawl_job_with_webhook(urls, webhook_url, include_data=False):
    """
    Submit a crawl job with webhook notification.

    Args:
        urls: List of URLs to crawl
        webhook_url: URL to receive webhook notifications
        include_data: Whether to include full results in webhook payload

    Returns:
        task_id: The job's task identifier
    """
    webhook_config = {
        "webhook_url": webhook_url,
        "webhook_data_in_payload": include_data,
        # Optional: Add custom headers for authentication
        # "webhook_headers": {
        #     "X-Webhook-Secret": "your-secret-token"
        # }
    }
    job_request = {
        "urls": urls,
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
        "webhook_config": webhook_config,
    }

    print(f"\n🚀 Submitting crawl job...")
    print(f" URLs: {urls}")
    print(f" Webhook: {webhook_url}")
    print(f" Include data: {include_data}")

    response = requests.post(
        f"{CRAWL4AI_BASE_URL}/crawl/job",
        json=job_request,
        headers={"Content-Type": "application/json"},
    )

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
||||||
|
|
||||||
|
|
||||||
|
def submit_llm_job_with_webhook(url, query, webhook_url, include_data=False, schema=None, provider=None):
    """
    Submit an LLM extraction job with webhook notification.

    Args:
        url: URL to extract content from
        query: Instruction for the LLM (e.g., "Extract article title and author")
        webhook_url: URL to receive webhook notifications
        include_data: Whether to include full results in webhook payload
        schema: Optional JSON schema for structured extraction
        provider: Optional LLM provider (e.g., "openai/gpt-4o-mini")

    Returns:
        task_id: The job's task identifier
    """
    job_request = {
        "url": url,
        "q": query,
        "cache": False,
        "webhook_config": {
            "webhook_url": webhook_url,
            "webhook_data_in_payload": include_data,
            # Optional: Add custom headers for authentication
            # "webhook_headers": {
            #     "X-Webhook-Secret": "your-secret-token"
            # }
        },
    }

    # Optional fields are only sent when supplied.
    if schema:
        job_request["schema"] = schema
    if provider:
        job_request["provider"] = provider

    print(f"\n🤖 Submitting LLM extraction job...")
    print(f" URL: {url}")
    print(f" Query: {query}")
    print(f" Webhook: {webhook_url}")
    print(f" Include data: {include_data}")
    if provider:
        print(f" Provider: {provider}")

    response = requests.post(
        f"{CRAWL4AI_BASE_URL}/llm/job",
        json=job_request,
        headers={"Content-Type": "application/json"},
    )

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
||||||
|
|
||||||
|
|
||||||
|
def submit_job_without_webhook(urls):
    """
    Submit a job without webhook (traditional polling approach).

    Args:
        urls: List of URLs to crawl

    Returns:
        task_id: The job's task identifier
    """
    job_request = {
        "urls": urls,
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
    }

    print(f"\n🚀 Submitting crawl job (without webhook)...")
    print(f" URLs: {urls}")

    response = requests.post(
        f"{CRAWL4AI_BASE_URL}/crawl/job",
        json=job_request,
    )

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
||||||
|
|
||||||
|
|
||||||
|
def poll_job_status(task_id, timeout=60):
    """
    Poll for job status (used when webhook is not configured).

    Args:
        task_id: The job's task identifier
        timeout: Maximum time to wait in seconds
    """
    print(f"\n⏳ Polling for job status...")
    deadline = time.time() + timeout

    while time.time() < deadline:
        response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")

        if not response.ok:
            print(f" ❌ Failed to get status: {response.text}")
            return None

        data = response.json()
        state = data.get('status', 'unknown')

        if state == 'completed':
            print(f" ✅ Job completed!")
            return data
        if state == 'failed':
            print(f" ❌ Job failed: {data.get('error', 'Unknown error')}")
            return data

        print(f" ⏳ Status: {state}, waiting...")
        time.sleep(2)

    print(f" ⏰ Timeout reached")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the webhook demonstration.

    Exercises five scenarios against a local Crawl4AI Docker instance:
    crawl jobs with notification-only and data-in-payload webhooks, LLM
    extraction jobs without and with a JSON schema, and a traditional
    polling workflow for comparison. A local webhook receiver runs in a
    daemon thread so callbacks accumulate in ``received_webhooks``.
    """

    # Check if Crawl4AI is running. Narrowed from a bare `except:`, which
    # would also swallow KeyboardInterrupt/SystemExit; RequestException
    # covers connection-level failures, ValueError a non-JSON response.
    try:
        health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
        print(f"✅ Crawl4AI is running: {health.json()}")
    except (requests.RequestException, ValueError):
        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
        print(" Please make sure Docker container is running:")
        print(" docker run -d -p 11235:11235 --name crawl4ai unclecode/crawl4ai:latest")
        return

    # Start webhook server in background thread (daemon: dies with the demo)
    print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
    webhook_thread = Thread(target=start_webhook_server, daemon=True)
    webhook_thread.start()
    time.sleep(2)  # Give server time to start

    # Example 1: Job with webhook (notification only, fetch data separately)
    print(f"\n{'='*60}")
    print("Example 1: Webhook Notification Only")
    print(f"{'='*60}")
    task_id_1 = submit_crawl_job_with_webhook(
        urls=["https://example.com"],
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
        include_data=False
    )

    # Example 2: Job with webhook (data included in payload)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 2: Webhook with Full Data")
    print(f"{'='*60}")
    task_id_2 = submit_crawl_job_with_webhook(
        urls=["https://www.python.org"],
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
        include_data=True
    )

    # Example 3: LLM extraction with webhook (notification only)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 3: LLM Extraction with Webhook (Notification Only)")
    print(f"{'='*60}")
    task_id_3 = submit_llm_job_with_webhook(
        url="https://www.example.com",
        query="Extract the main heading and description from this page.",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=False,
        provider="openai/gpt-4o-mini"
    )

    # Example 4: LLM extraction with webhook (data included + schema)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 4: LLM Extraction with Schema and Full Data")
    print(f"{'='*60}")

    # Define a schema for structured extraction (the API expects a JSON string)
    schema = json.dumps({
        "type": "object",
        "properties": {
            "title": {"type": "string", "description": "Page title"},
            "description": {"type": "string", "description": "Page description"}
        },
        "required": ["title"]
    })

    task_id_4 = submit_llm_job_with_webhook(
        url="https://www.python.org",
        query="Extract the title and description of this website",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=True,
        schema=schema,
        provider="openai/gpt-4o-mini"
    )

    # Example 5: Traditional polling (no webhook)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 5: Traditional Polling (No Webhook)")
    print(f"{'='*60}")
    task_id_5 = submit_job_without_webhook(
        urls=["https://github.com"]
    )
    if task_id_5:
        result = poll_job_status(task_id_5)
        if result and result.get('status') == 'completed':
            print(f" ✅ Results retrieved via polling")

    # Wait for webhooks to arrive
    print(f"\n⏳ Waiting for webhooks to be received...")
    time.sleep(30)  # Give jobs time to complete and webhooks to arrive (longer for LLM)

    # Summary
    print(f"\n{'='*60}")
    print("Summary")
    print(f"{'='*60}")
    print(f"Total webhooks received: {len(received_webhooks)}")

    crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
    llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']

    print(f"\n📊 Breakdown:")
    print(f" - Crawl webhooks: {len(crawl_webhooks)}")
    print(f" - LLM extraction webhooks: {len(llm_webhooks)}")

    print(f"\n📋 Details:")
    for i, webhook in enumerate(received_webhooks, 1):
        task_type = webhook['task_type']
        icon = "🕷️" if task_type == "crawl" else "🤖"
        print(f"{i}. {icon} Task {webhook['task_id']}: {webhook['status']} ({task_type})")

    print(f"\n✅ Demo completed!")
    print(f"\n💡 Pro tips:")
    print(f" - In production, your webhook URL should be publicly accessible")
    print(f" (e.g., https://myapp.com/webhooks) or use ngrok for testing")
    print(f" - Both /crawl/job and /llm/job support the same webhook configuration")
    print(f" - Use webhook_data_in_payload=true to get results directly in the webhook")
    print(f" - LLM jobs may take longer, adjust timeouts accordingly")
|
|
||||||
|
|
||||||
|
# Entry point: run the full webhook demo only when executed as a script
# (not when this module is imported).
if __name__ == "__main__":
    main()
|
||||||
BIN
docs/md_v2/assets/crawl4ai-skill.zip
Normal file
BIN
docs/md_v2/assets/crawl4ai-skill.zip
Normal file
Binary file not shown.
@@ -20,17 +20,43 @@ Ever wondered why your AI coding assistant struggles with your library despite c
|
|||||||
|
|
||||||
## Latest Release
|
## Latest Release
|
||||||
|
|
||||||
|
### [Crawl4AI v0.7.6 – The Webhook Infrastructure Update](../blog/release-v0.7.6.md)
|
||||||
|
*October 22, 2025*
|
||||||
|
|
||||||
|
Crawl4AI v0.7.6 introduces comprehensive webhook support for the Docker job queue API, bringing real-time notifications to both crawling and LLM extraction workflows. No more polling!
|
||||||
|
|
||||||
|
Key highlights:
|
||||||
|
- **🪝 Complete Webhook Support**: Real-time notifications for both `/crawl/job` and `/llm/job` endpoints
|
||||||
|
- **🔄 Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||||
|
- **🔐 Custom Authentication**: Add custom headers for webhook authentication
|
||||||
|
- **📊 Flexible Delivery**: Choose notification-only or include full data in payload
|
||||||
|
- **⚙️ Global Configuration**: Set default webhook URL in config.yml for all jobs
|
||||||
|
- **🎯 Zero Breaking Changes**: Fully backward compatible, webhooks are opt-in
|
||||||
|
|
||||||
|
[Read full release notes →](../blog/release-v0.7.6.md)
|
||||||
|
|
||||||
|
## Recent Releases
|
||||||
|
|
||||||
|
### [Crawl4AI v0.7.5 – The Docker Hooks & Security Update](../blog/release-v0.7.5.md)
|
||||||
|
*September 29, 2025*
|
||||||
|
|
||||||
|
Crawl4AI v0.7.5 introduces the powerful Docker Hooks System for complete pipeline customization, enhanced LLM integration with custom providers, HTTPS preservation for modern web security, and resolves multiple community-reported issues.
|
||||||
|
|
||||||
|
Key highlights:
|
||||||
|
- **🔧 Docker Hooks System**: Custom Python functions at 8 key pipeline points for unprecedented customization
|
||||||
|
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
||||||
|
- **🔒 HTTPS Preservation**: Secure internal link handling for modern web applications
|
||||||
|
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
||||||
|
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
||||||
|
|
||||||
|
[Read full release notes →](../blog/release-v0.7.5.md)
|
||||||
|
|
||||||
|
## Recent Releases
|
||||||
|
|
||||||
### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
|
### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
|
||||||
*August 17, 2025*
|
*August 17, 2025*
|
||||||
|
|
||||||
Crawl4AI v0.7.4 introduces revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes that make Crawl4AI more robust for production workloads.
|
Revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes.
|
||||||
|
|
||||||
Key highlights:
|
|
||||||
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
|
||||||
- **⚡ Dispatcher Bug Fix**: Fixed sequential processing issue in arun_many for fast-completing tasks
|
|
||||||
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
|
||||||
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
|
||||||
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
|
||||||
|
|
||||||
[Read full release notes →](../blog/release-v0.7.4.md)
|
[Read full release notes →](../blog/release-v0.7.4.md)
|
||||||
|
|
||||||
|
|||||||
314
docs/md_v2/blog/releases/0.7.6.md
Normal file
314
docs/md_v2/blog/releases/0.7.6.md
Normal file
@@ -0,0 +1,314 @@
|
|||||||
|
# Crawl4AI v0.7.6 Release Notes
|
||||||
|
|
||||||
|
*Release Date: October 22, 2025*
|
||||||
|
|
||||||
|
I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
|
||||||
|
|
||||||
|
## 🎯 What's New
|
||||||
|
|
||||||
|
### Webhook Support for Docker Job Queue API
|
||||||
|
|
||||||
|
The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done - get instant notifications when they complete!
|
||||||
|
|
||||||
|
**Key Capabilities:**
|
||||||
|
|
||||||
|
- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
|
||||||
|
- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
|
||||||
|
- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||||
|
- ✅ **Custom Authentication**: Add custom headers for webhook authentication
|
||||||
|
- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
|
||||||
|
- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
Instead of constantly checking job status:
|
||||||
|
|
||||||
|
**OLD WAY (Polling):**
|
||||||
|
```python
|
||||||
|
# Submit job
|
||||||
|
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
|
||||||
|
# Poll until complete
|
||||||
|
while True:
|
||||||
|
status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
||||||
|
if status.json()['status'] == 'completed':
|
||||||
|
break
|
||||||
|
time.sleep(5) # Wait and try again
|
||||||
|
```
|
||||||
|
|
||||||
|
**NEW WAY (Webhooks):**
|
||||||
|
```python
|
||||||
|
# Submit job with webhook
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||||
|
|
||||||
|
# Done! Webhook will notify you when complete
|
||||||
|
# Your webhook handler receives the results automatically
|
||||||
|
```
|
||||||
|
|
||||||
|
### Crawl Job Webhooks
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {"headless": true},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### LLM Extraction Job Webhooks (NEW!)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/llm/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"q": "Extract the article title, author, and publication date",
|
||||||
|
"schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Webhook Payload Structure
|
||||||
|
|
||||||
|
**Success (with data):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432",
|
||||||
|
"task_type": "llm_extraction",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com/article"],
|
||||||
|
"data": {
|
||||||
|
"extracted_content": {
|
||||||
|
"title": "Understanding Web Scraping",
|
||||||
|
"author": "John Doe",
|
||||||
|
"date": "2025-10-22"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Failure:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_abc123",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "failed",
|
||||||
|
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"error": "Connection timeout after 30s"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Simple Webhook Handler Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from flask import Flask, request, jsonify
import requests
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@app.route('/webhook', methods=['POST'])
|
||||||
|
def handle_webhook():
|
||||||
|
payload = request.json
|
||||||
|
|
||||||
|
task_id = payload['task_id']
|
||||||
|
task_type = payload['task_type']
|
||||||
|
status = payload['status']
|
||||||
|
|
||||||
|
if status == 'completed':
|
||||||
|
if 'data' in payload:
|
||||||
|
# Process data directly
|
||||||
|
data = payload['data']
|
||||||
|
else:
|
||||||
|
# Fetch from API
|
||||||
|
endpoint = 'crawl' if task_type == 'crawl' else 'llm'
|
||||||
|
response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
# Your business logic here
|
||||||
|
print(f"Job {task_id} completed!")
|
||||||
|
|
||||||
|
elif status == 'failed':
|
||||||
|
error = payload.get('error', 'Unknown error')
|
||||||
|
print(f"Job {task_id} failed: {error}")
|
||||||
|
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
app.run(port=8080)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📊 Performance Improvements
|
||||||
|
|
||||||
|
- **Reduced Server Load**: Eliminates constant polling requests
|
||||||
|
- **Lower Latency**: Instant notification vs. polling interval delay
|
||||||
|
- **Better Resource Usage**: Frees up client connections while jobs run in background
|
||||||
|
- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
|
||||||
|
|
||||||
|
## 🐛 Bug Fixes
|
||||||
|
|
||||||
|
- Fixed webhook configuration serialization for Pydantic HttpUrl fields
|
||||||
|
- Improved error handling in webhook delivery service
|
||||||
|
- Enhanced Redis task storage for webhook config persistence
|
||||||
|
|
||||||
|
## 🌍 Expected Real-World Impact
|
||||||
|
|
||||||
|
### For Web Scraping Workflows
|
||||||
|
- **Reduced Costs**: Fewer API calls = lower bandwidth and server costs
|
||||||
|
- **Better UX**: Instant notifications improve user experience
|
||||||
|
- **Scalability**: Handle 100s of concurrent jobs without polling overhead
|
||||||
|
|
||||||
|
### For LLM Extraction Pipelines
|
||||||
|
- **Async Processing**: Submit LLM extraction jobs and move on
|
||||||
|
- **Batch Processing**: Queue multiple extractions, get notified as they complete
|
||||||
|
- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
|
||||||
|
|
||||||
|
### For Microservices
|
||||||
|
- **Event-Driven**: Perfect for event-driven microservice architectures
|
||||||
|
- **Decoupling**: Decouple job submission from result processing
|
||||||
|
- **Reliability**: Automatic retries ensure webhooks are delivered
|
||||||
|
|
||||||
|
## 🔄 Breaking Changes
|
||||||
|
|
||||||
|
**None!** This release is fully backward compatible.
|
||||||
|
|
||||||
|
- Webhook configuration is optional
|
||||||
|
- Existing code continues to work without modification
|
||||||
|
- Polling is still supported for jobs without webhook config
|
||||||
|
|
||||||
|
## 📚 Documentation
|
||||||
|
|
||||||
|
### New Documentation
|
||||||
|
- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
|
||||||
|
- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
|
||||||
|
|
||||||
|
### Updated Documentation
|
||||||
|
- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
|
||||||
|
- API documentation with webhook examples
|
||||||
|
|
||||||
|
## 🛠️ Migration Guide
|
||||||
|
|
||||||
|
No migration needed! Webhooks are opt-in:
|
||||||
|
|
||||||
|
1. **To use webhooks**: Add `webhook_config` to your job payload
|
||||||
|
2. **To keep polling**: Continue using your existing code
|
||||||
|
|
||||||
|
### Quick Start
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Just add webhook_config to your existing payload
|
||||||
|
payload = {
|
||||||
|
# Your existing configuration
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {...},
|
||||||
|
"crawler_config": {...},
|
||||||
|
|
||||||
|
# NEW: Add webhook configuration
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔧 Configuration
|
||||||
|
|
||||||
|
### Global Webhook Configuration (config.yml)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: "https://myapp.com/webhooks/default" # Optional
|
||||||
|
data_in_payload: false
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000
|
||||||
|
headers:
|
||||||
|
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🚀 Upgrade Instructions
|
||||||
|
|
||||||
|
### Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pull the latest image
|
||||||
|
docker pull unclecode/crawl4ai:0.7.6
|
||||||
|
|
||||||
|
# Or use latest tag
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
|
||||||
|
# Run with webhook support
|
||||||
|
docker run -d \
|
||||||
|
-p 11235:11235 \
|
||||||
|
--env-file .llm.env \
|
||||||
|
--name crawl4ai \
|
||||||
|
unclecode/crawl4ai:0.7.6
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python Package
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install --upgrade crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
## 💡 Pro Tips
|
||||||
|
|
||||||
|
1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads
|
||||||
|
2. **Set custom headers** for webhook authentication and request tracking
|
||||||
|
3. **Configure global default webhook** for consistent handling across all jobs
|
||||||
|
4. **Implement idempotent webhook handlers** - same webhook may be delivered multiple times on retry
|
||||||
|
5. **Use structured schemas** with LLM extraction for predictable webhook data
|
||||||
|
|
||||||
|
## 🎬 Demo
|
||||||
|
|
||||||
|
Try the release demo:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python docs/releases_review/demo_v0.7.6.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This comprehensive demo showcases:
|
||||||
|
- Crawl job webhooks (notification-only and with data)
|
||||||
|
- LLM extraction webhooks (with JSON schema support)
|
||||||
|
- Custom headers for authentication
|
||||||
|
- Webhook retry mechanism
|
||||||
|
- Real-time webhook receiver
|
||||||
|
|
||||||
|
## 🙏 Acknowledgments
|
||||||
|
|
||||||
|
Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
|
||||||
|
|
||||||
|
## 📞 Support
|
||||||
|
|
||||||
|
- **Documentation**: https://docs.crawl4ai.com
|
||||||
|
- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
|
||||||
|
- **Discord**: https://discord.gg/crawl4ai
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Happy crawling with webhooks!** 🕷️🪝
|
||||||
|
|
||||||
|
*- unclecode*
|
||||||
318
docs/md_v2/blog/releases/v0.7.5.md
Normal file
318
docs/md_v2/blog/releases/v0.7.5.md
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
||||||
|
|
||||||
|
*September 29, 2025 • 8 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
||||||
|
|
||||||
|
## 🎯 What's New at a Glance
|
||||||
|
|
||||||
|
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
||||||
|
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
||||||
|
- **Enhanced LLM Integration**: Custom providers with temperature control
|
||||||
|
- **HTTPS Preservation**: Secure internal link handling
|
||||||
|
- **Bug Fixes**: Resolved multiple community-reported issues
|
||||||
|
- **Improved Docker Error Handling**: Better debugging and reliability
|
||||||
|
|
||||||
|
## 🔧 Docker Hooks System: Pipeline Customization
|
||||||
|
|
||||||
|
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
||||||
|
|
||||||
|
### Real Example: Authentication & Performance
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Real working hooks for httpbin.org
|
||||||
|
hooks_config = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Setting up page context")
|
||||||
|
# Block images to speed up crawling
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
print("Hook: Images blocked")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_retrieve_html": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Before retrieving HTML")
|
||||||
|
# Scroll to bottom to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
print("Hook: Scrolled to bottom")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_goto": """
|
||||||
|
async def hook(page, context, url, **kwargs):
|
||||||
|
print(f"Hook: About to navigate to {url}")
|
||||||
|
# Add custom headers
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Test-Header': 'crawl4ai-hooks-test'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test with Docker API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {
|
||||||
|
"code": hooks_config,
|
||||||
|
"timeout": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
result = response.json()
|
||||||
|
|
||||||
|
if result.get('success'):
|
||||||
|
print("✅ Hooks executed successfully!")
|
||||||
|
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Available Hook Points:**
|
||||||
|
- `on_browser_created`: Browser setup
|
||||||
|
- `on_page_context_created`: Page context configuration
|
||||||
|
- `before_goto`: Pre-navigation setup
|
||||||
|
- `after_goto`: Post-navigation processing
|
||||||
|
- `on_user_agent_updated`: User agent changes
|
||||||
|
- `on_execution_started`: Crawl initialization
|
||||||
|
- `before_retrieve_html`: Pre-extraction processing
|
||||||
|
- `before_return_html`: Final HTML processing
|
||||||
|
|
||||||
|
### Function-Based Hooks API
|
||||||
|
|
||||||
|
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
||||||
|
|
||||||
|
**Option 1: Using the `hooks_to_string()` Utility**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions (with full IDE support!)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers"""
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Crawl4AI': 'v0.7.5',
|
||||||
|
'X-Custom-Header': 'my-value'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Convert functions to strings
|
||||||
|
hooks_code = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
})
|
||||||
|
|
||||||
|
# Use with REST API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {"code": hooks_code, "timeout": 30}
|
||||||
|
}
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Define hooks as functions (same as above)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_retrieve_html(page, context, **kwargs):
|
||||||
|
# Scroll to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use Docker client - conversion happens automatically!
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_retrieve_html": before_retrieve_html
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if results and results.success:
|
||||||
|
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits of Function-Based Hooks:**
|
||||||
|
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
||||||
|
- ✅ Type checking and linting
|
||||||
|
- ✅ Easier to test and debug
|
||||||
|
- ✅ Reusable across projects
|
||||||
|
- ✅ Automatic conversion in Docker client
|
||||||
|
- ✅ No breaking changes - string hooks still work!
|
||||||
|
|
||||||
|
## 🤖 Enhanced LLM Integration
|
||||||
|
|
||||||
|
Enhanced LLM integration with custom providers, temperature control, and base URL configuration.
|
||||||
|
|
||||||
|
### Multi-Provider Support
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
|
# Test with different providers
|
||||||
|
async def test_llm_providers():
|
||||||
|
# OpenAI with custom temperature
|
||||||
|
openai_strategy = LLMExtractionStrategy(
|
||||||
|
provider="gemini/gemini-2.5-flash-lite",
|
||||||
|
api_token="your-api-token",
|
||||||
|
temperature=0.7, # New in v0.7.5
|
||||||
|
instruction="Summarize this page in one sentence"
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://example.com",
|
||||||
|
config=CrawlerRunConfig(extraction_strategy=openai_strategy)
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print("✅ LLM extraction completed")
|
||||||
|
print(result.extracted_content)
|
||||||
|
|
||||||
|
# Docker API with enhanced LLM config
|
||||||
|
llm_payload = {
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Summarize this page in one sentence.",
|
||||||
|
"provider": "gemini/gemini-2.5-flash-lite",
|
||||||
|
"temperature": 0.7
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/md", json=llm_payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**New Features:**
|
||||||
|
- Custom `temperature` parameter for creativity control
|
||||||
|
- `base_url` for custom API endpoints
|
||||||
|
- Multi-provider environment variable support
|
||||||
|
- Docker API integration
|
||||||
|
|
||||||
|
## 🔒 HTTPS Preservation
|
||||||
|
|
||||||
|
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
||||||
|
|
||||||
|
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
||||||
|
|
||||||
|
async def test_https_preservation():
|
||||||
|
# Enable HTTPS preservation
|
||||||
|
url_filter = URLPatternFilter(
|
||||||
|
        patterns=[r"^(https://)?quotes\.toscrape\.com(/.*)?$"]
|
||||||
|
)
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
exclude_external_links=True,
|
||||||
|
preserve_https_for_internal_links=True, # New in v0.7.5
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||||
|
max_depth=2,
|
||||||
|
max_pages=5,
|
||||||
|
filter_chain=FilterChain([url_filter])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
async for result in await crawler.arun(
|
||||||
|
url="https://quotes.toscrape.com",
|
||||||
|
config=config
|
||||||
|
):
|
||||||
|
# All internal links maintain HTTPS
|
||||||
|
internal_links = [link['href'] for link in result.links['internal']]
|
||||||
|
https_links = [link for link in internal_links if link.startswith('https://')]
|
||||||
|
|
||||||
|
print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
|
||||||
|
for link in https_links[:3]:
|
||||||
|
print(f" → {link}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🛠️ Bug Fixes and Improvements
|
||||||
|
|
||||||
|
### Major Fixes
|
||||||
|
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
||||||
|
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
||||||
|
- **Docker Error Handling**: Comprehensive error messages with status codes
|
||||||
|
- **Memory Management**: Fixed leaks in long-running sessions
|
||||||
|
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
||||||
|
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
||||||
|
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
||||||
|
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
||||||
|
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
||||||
|
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
||||||
|
|
||||||
|
### Community-Reported Issues Fixed
|
||||||
|
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
||||||
|
- Fixed browser configuration reference errors
|
||||||
|
- Resolved dependency conflicts with cssselect
|
||||||
|
- Improved error messaging for failed authentications
|
||||||
|
- Enhanced compatibility with various proxy configurations
|
||||||
|
- Fixed edge cases in URL normalization
|
||||||
|
|
||||||
|
### Configuration Updates
|
||||||
|
```python
|
||||||
|
# Old proxy config (deprecated)
|
||||||
|
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
||||||
|
|
||||||
|
# New enhanced proxy config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
proxy_config={
|
||||||
|
"server": "http://proxy:8080",
|
||||||
|
"username": "optional-user",
|
||||||
|
"password": "optional-pass"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔄 Breaking Changes
|
||||||
|
|
||||||
|
1. **Python 3.10+ Required**: Upgrade from Python 3.9
|
||||||
|
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
||||||
|
3. **New Dependency**: Added `cssselect` for better CSS handling
|
||||||
|
|
||||||
|
## 🚀 Get Started
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install latest version
|
||||||
|
pip install crawl4ai==0.7.5
|
||||||
|
|
||||||
|
# Docker deployment
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
**Try the Demo:**
|
||||||
|
```bash
|
||||||
|
# Run working examples
|
||||||
|
python docs/releases_review/demo_v0.7.5.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Resources:**
|
||||||
|
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||||
|
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
|
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
||||||
|
|
||||||
|
Happy crawling! 🕷️
|
||||||
5196
docs/md_v2/complete-sdk-reference.md
Normal file
5196
docs/md_v2/complete-sdk-reference.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -6,18 +6,6 @@
|
|||||||
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
|
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
|
||||||
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
|
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
|
||||||
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
|
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
|
||||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
|
||||||
- [Using the API](#using-the-api)
|
|
||||||
- [Playground Interface](#playground-interface)
|
|
||||||
- [Python SDK](#python-sdk)
|
|
||||||
- [Understanding Request Schema](#understanding-request-schema)
|
|
||||||
- [REST API Examples](#rest-api-examples)
|
|
||||||
- [Additional API Endpoints](#additional-api-endpoints)
|
|
||||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
|
||||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
|
||||||
- [PDF Export Endpoint](#pdf-export-endpoint)
|
|
||||||
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
|
||||||
- [Library Context Endpoint](#library-context-endpoint)
|
|
||||||
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
||||||
- [What is MCP?](#what-is-mcp)
|
- [What is MCP?](#what-is-mcp)
|
||||||
- [Connecting via MCP](#connecting-via-mcp)
|
- [Connecting via MCP](#connecting-via-mcp)
|
||||||
@@ -25,9 +13,36 @@
|
|||||||
- [Available MCP Tools](#available-mcp-tools)
|
- [Available MCP Tools](#available-mcp-tools)
|
||||||
- [Testing MCP Connections](#testing-mcp-connections)
|
- [Testing MCP Connections](#testing-mcp-connections)
|
||||||
- [MCP Schemas](#mcp-schemas)
|
- [MCP Schemas](#mcp-schemas)
|
||||||
|
- [Additional API Endpoints](#additional-api-endpoints)
|
||||||
|
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||||
|
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||||
|
- [PDF Export Endpoint](#pdf-export-endpoint)
|
||||||
|
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
||||||
|
- [User-Provided Hooks API](#user-provided-hooks-api)
|
||||||
|
- [Hook Information Endpoint](#hook-information-endpoint)
|
||||||
|
- [Available Hook Points](#available-hook-points)
|
||||||
|
- [Using Hooks in Requests](#using-hooks-in-requests)
|
||||||
|
- [Hook Examples with Real URLs](#hook-examples-with-real-urls)
|
||||||
|
- [Security Best Practices](#security-best-practices)
|
||||||
|
- [Hook Response Information](#hook-response-information)
|
||||||
|
- [Error Handling](#error-handling)
|
||||||
|
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
|
||||||
|
- [Job Queue & Webhook API](#job-queue-webhook-api)
|
||||||
|
- [Why Use the Job Queue API?](#why-use-the-job-queue-api)
|
||||||
|
- [Available Endpoints](#available-endpoints)
|
||||||
|
- [Webhook Configuration](#webhook-configuration)
|
||||||
|
- [Usage Examples](#usage-examples)
|
||||||
|
- [Webhook Best Practices](#webhook-best-practices)
|
||||||
|
- [Use Cases](#use-cases)
|
||||||
|
- [Troubleshooting](#troubleshooting)
|
||||||
|
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||||
|
- [Using the API](#using-the-api)
|
||||||
|
- [Playground Interface](#playground-interface)
|
||||||
|
- [Python SDK](#python-sdk)
|
||||||
|
- [Understanding Request Schema](#understanding-request-schema)
|
||||||
|
- [REST API Examples](#rest-api-examples)
|
||||||
|
- [LLM Configuration Examples](#llm-configuration-examples)
|
||||||
- [Metrics & Monitoring](#metrics--monitoring)
|
- [Metrics & Monitoring](#metrics--monitoring)
|
||||||
- [Deployment Scenarios](#deployment-scenarios)
|
|
||||||
- [Complete Examples](#complete-examples)
|
|
||||||
- [Server Configuration](#server-configuration)
|
- [Server Configuration](#server-configuration)
|
||||||
- [Understanding config.yml](#understanding-configyml)
|
- [Understanding config.yml](#understanding-configyml)
|
||||||
- [JWT Authentication](#jwt-authentication)
|
- [JWT Authentication](#jwt-authentication)
|
||||||
@@ -58,13 +73,13 @@ Pull and run images directly from Docker Hub without building locally.
|
|||||||
|
|
||||||
#### 1. Pull the Image
|
#### 1. Pull the Image
|
||||||
|
|
||||||
Our latest release is `0.7.3`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
Our latest release is `0.7.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||||
|
|
||||||
> 💡 **Note**: The `latest` tag points to the stable `0.7.3` version.
|
> 💡 **Note**: The `latest` tag points to the stable `0.7.6` version.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Pull the latest version
|
# Pull the latest version
|
||||||
docker pull unclecode/crawl4ai:0.7.3
|
docker pull unclecode/crawl4ai:0.7.6
|
||||||
|
|
||||||
# Or pull using the latest tag
|
# Or pull using the latest tag
|
||||||
docker pull unclecode/crawl4ai:latest
|
docker pull unclecode/crawl4ai:latest
|
||||||
@@ -136,7 +151,7 @@ docker stop crawl4ai && docker rm crawl4ai
|
|||||||
#### Docker Hub Versioning Explained
|
#### Docker Hub Versioning Explained
|
||||||
|
|
||||||
* **Image Name:** `unclecode/crawl4ai`
|
* **Image Name:** `unclecode/crawl4ai`
|
||||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.3`)
|
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.6`)
|
||||||
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
|
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
|
||||||
* `SUFFIX`: Optional tag for release candidates (`rc1`) and revisions (`r1`)
|
* `SUFFIX`: Optional tag for release candidates (`rc1`) and revisions (`r1`)
|
||||||
* **`latest` Tag:** Points to the most recent stable version
|
* **`latest` Tag:** Points to the most recent stable version
|
||||||
@@ -832,6 +847,733 @@ else:
|
|||||||
|
|
||||||
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
|
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
|
||||||
|
|
||||||
|
### Hooks Utility: Function-Based Approach (Python)
|
||||||
|
|
||||||
|
For Python developers, Crawl4AI provides a more convenient way to work with hooks using the `hooks_to_string()` utility function and Docker client integration.
|
||||||
|
|
||||||
|
#### Why Use Function-Based Hooks?
|
||||||
|
|
||||||
|
**String-Based Approach (shown above)**:
|
||||||
|
```python
|
||||||
|
hooks_code = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Function-Based Approach (recommended for Python)**:
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def my_hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks={"on_page_context_created": my_hook}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits**:
|
||||||
|
- ✅ Write hooks as regular Python functions
|
||||||
|
- ✅ Full IDE support (autocomplete, syntax highlighting, type checking)
|
||||||
|
- ✅ Easy to test and debug
|
||||||
|
- ✅ Reusable hook libraries
|
||||||
|
- ✅ Automatic conversion to API format
|
||||||
|
|
||||||
|
#### Using the Hooks Utility
|
||||||
|
|
||||||
|
The `hooks_to_string()` utility converts Python function objects to the string format required by the API:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
|
||||||
|
# Define your hooks as functions
|
||||||
|
async def setup_hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "session",
|
||||||
|
"value": "token",
|
||||||
|
"domain": ".example.com"
|
||||||
|
}])
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def scroll_hook(page, context, **kwargs):
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Convert to string format
|
||||||
|
hooks_dict = {
|
||||||
|
"on_page_context_created": setup_hook,
|
||||||
|
"before_retrieve_html": scroll_hook
|
||||||
|
}
|
||||||
|
hooks_string = hooks_to_string(hooks_dict)
|
||||||
|
|
||||||
|
# Now use with REST API or Docker client
|
||||||
|
# hooks_string contains the string representations
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Docker Client with Automatic Conversion
|
||||||
|
|
||||||
|
The Docker client automatically detects and converts function objects:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def auth_hook(page, context, **kwargs):
|
||||||
|
"""Add authentication cookies"""
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "auth_token",
|
||||||
|
"value": "your_token",
|
||||||
|
"domain": ".example.com"
|
||||||
|
}])
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def performance_hook(page, context, **kwargs):
|
||||||
|
"""Block unnecessary resources"""
|
||||||
|
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||||
|
await context.route("**/analytics/*", lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||||
|
# Pass functions directly - automatic conversion!
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": performance_hook,
|
||||||
|
"before_goto": auth_hook
|
||||||
|
},
|
||||||
|
hooks_timeout=30 # Optional timeout in seconds (1-120)
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"Success: {result.success}")
|
||||||
|
print(f"HTML: {len(result.html)} chars")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Creating Reusable Hook Libraries
|
||||||
|
|
||||||
|
Build collections of reusable hooks:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# hooks_library.py
|
||||||
|
class CrawlHooks:
|
||||||
|
"""Reusable hook collection for common crawling tasks"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def block_images(page, context, **kwargs):
|
||||||
|
"""Block all images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def block_analytics(page, context, **kwargs):
|
||||||
|
"""Block analytics and tracking scripts"""
|
||||||
|
tracking_domains = [
|
||||||
|
"**/google-analytics.com/*",
|
||||||
|
"**/googletagmanager.com/*",
|
||||||
|
"**/facebook.com/tr/*",
|
||||||
|
"**/doubleclick.net/*"
|
||||||
|
]
|
||||||
|
for domain in tracking_domains:
|
||||||
|
await context.route(domain, lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def scroll_infinite(page, context, **kwargs):
|
||||||
|
"""Handle infinite scroll to load more content"""
|
||||||
|
previous_height = 0
|
||||||
|
for i in range(5): # Max 5 scrolls
|
||||||
|
current_height = await page.evaluate("document.body.scrollHeight")
|
||||||
|
if current_height == previous_height:
|
||||||
|
break
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
previous_height = current_height
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def wait_for_dynamic_content(page, context, url, response, **kwargs):
|
||||||
|
"""Wait for dynamic content to load"""
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
try:
|
||||||
|
# Click "Load More" if present
|
||||||
|
load_more = await page.query_selector('[class*="load-more"]')
|
||||||
|
if load_more:
|
||||||
|
await load_more.click()
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
        except Exception:  # "Load More" button may be absent — safe to ignore
|
||||||
|
pass
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use in your application
|
||||||
|
from hooks_library import CrawlHooks
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def crawl_with_optimizations(url):
|
||||||
|
async with Crawl4aiDockerClient() as client:
|
||||||
|
result = await client.crawl(
|
||||||
|
[url],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": CrawlHooks.block_images,
|
||||||
|
"before_retrieve_html": CrawlHooks.scroll_infinite
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Choosing the Right Approach
|
||||||
|
|
||||||
|
| Approach | Best For | IDE Support | Language |
|
||||||
|
|----------|----------|-------------|----------|
|
||||||
|
| **String-based** | Non-Python clients, REST APIs, other languages | ❌ None | Any |
|
||||||
|
| **Function-based** | Python applications, local development | ✅ Full | Python only |
|
||||||
|
| **Docker Client** | Python apps with automatic conversion | ✅ Full | Python only |
|
||||||
|
|
||||||
|
**Recommendation**:
|
||||||
|
- **Python applications**: Use Docker client with function objects (easiest)
|
||||||
|
- **Non-Python or REST API**: Use string-based hooks (most flexible)
|
||||||
|
- **Manual control**: Use `hooks_to_string()` utility (middle ground)
|
||||||
|
|
||||||
|
#### Complete Example with Function Hooks
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions
|
||||||
|
async def setup_environment(page, context, **kwargs):
|
||||||
|
"""Setup crawling environment"""
|
||||||
|
# Set viewport
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
|
||||||
|
# Block resources for speed
|
||||||
|
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||||
|
|
||||||
|
# Add custom headers
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
"Accept-Language": "en-US",
|
||||||
|
"X-Custom-Header": "Crawl4AI"
|
||||||
|
})
|
||||||
|
|
||||||
|
print("[HOOK] Environment configured")
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def extract_content(page, context, **kwargs):
|
||||||
|
"""Extract and prepare content"""
|
||||||
|
# Scroll to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
|
||||||
|
# Extract metadata
|
||||||
|
metadata = await page.evaluate('''() => ({
|
||||||
|
title: document.title,
|
||||||
|
links: document.links.length,
|
||||||
|
images: document.images.length
|
||||||
|
})''')
|
||||||
|
|
||||||
|
print(f"[HOOK] Page metadata: {metadata}")
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
|
||||||
|
# Configure crawl
|
||||||
|
browser_config = BrowserConfig(headless=True)
|
||||||
|
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
# Crawl with hooks
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
browser_config=browser_config,
|
||||||
|
crawler_config=crawler_config,
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": setup_environment,
|
||||||
|
"before_retrieve_html": extract_content
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print(f"✅ Crawl successful!")
|
||||||
|
print(f" URL: {result.url}")
|
||||||
|
print(f" HTML: {len(result.html)} chars")
|
||||||
|
print(f" Markdown: {len(result.markdown)} chars")
|
||||||
|
else:
|
||||||
|
print(f"❌ Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Additional Resources
|
||||||
|
|
||||||
|
- **Comprehensive Examples**: See `/docs/examples/hooks_docker_client_example.py` for Python function-based examples
|
||||||
|
- **REST API Examples**: See `/docs/examples/hooks_rest_api_example.py` for string-based examples
|
||||||
|
- **Comparison Guide**: See `/docs/examples/README_HOOKS.md` for detailed comparison
|
||||||
|
- **Utility Documentation**: See `/docs/hooks-utility-guide.md` for complete guide
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Job Queue & Webhook API
|
||||||
|
|
||||||
|
The Docker deployment includes a powerful asynchronous job queue system with webhook support for both crawling and LLM extraction tasks. Instead of waiting for long-running operations to complete, submit jobs and receive real-time notifications via webhooks when they finish.
|
||||||
|
|
||||||
|
### Why Use the Job Queue API?
|
||||||
|
|
||||||
|
**Traditional Synchronous API (`/crawl`):**
|
||||||
|
- Client waits for entire crawl to complete
|
||||||
|
- Timeout issues with long-running crawls
|
||||||
|
- Resource blocking during execution
|
||||||
|
- Constant polling required for status updates
|
||||||
|
|
||||||
|
**Asynchronous Job Queue API (`/crawl/job`, `/llm/job`):**
|
||||||
|
- ✅ Submit job and continue immediately
|
||||||
|
- ✅ No timeout concerns for long operations
|
||||||
|
- ✅ Real-time webhook notifications on completion
|
||||||
|
- ✅ Better resource utilization
|
||||||
|
- ✅ Perfect for batch processing
|
||||||
|
- ✅ Ideal for microservice architectures
|
||||||
|
|
||||||
|
### Available Endpoints
|
||||||
|
|
||||||
|
#### 1. Crawl Job Endpoint
|
||||||
|
|
||||||
|
```
|
||||||
|
POST /crawl/job
|
||||||
|
```
|
||||||
|
|
||||||
|
Submit an asynchronous crawl job with optional webhook notification.
|
||||||
|
|
||||||
|
**Request Body:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"cache_mode": "bypass",
|
||||||
|
"extraction_strategy": {
|
||||||
|
"type": "JsonCssExtractionStrategy",
|
||||||
|
"schema": {
|
||||||
|
"title": "h1",
|
||||||
|
"content": ".article-body"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://your-app.com/webhook/crawl-complete",
|
||||||
|
"webhook_data_in_payload": true,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token",
|
||||||
|
"X-Custom-Header": "value"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_1698765432",
|
||||||
|
"message": "Crawl job submitted"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. LLM Extraction Job Endpoint
|
||||||
|
|
||||||
|
```
|
||||||
|
POST /llm/job
|
||||||
|
```
|
||||||
|
|
||||||
|
Submit an asynchronous LLM extraction job with optional webhook notification.
|
||||||
|
|
||||||
|
**Request Body:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"q": "Extract the article title, author, publication date, and main points",
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"schema": "{\"title\": \"string\", \"author\": \"string\", \"date\": \"string\", \"points\": [\"string\"]}",
|
||||||
|
"cache": false,
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://your-app.com/webhook/llm-complete",
|
||||||
|
"webhook_data_in_payload": true,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432",
|
||||||
|
"message": "LLM job submitted"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Job Status Endpoint
|
||||||
|
|
||||||
|
```
|
||||||
|
GET /job/{task_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
Check the status and retrieve results of a submitted job.
|
||||||
|
|
||||||
|
**Response (In Progress):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_1698765432",
|
||||||
|
"status": "processing",
|
||||||
|
"message": "Job is being processed"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response (Completed):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_1698765432",
|
||||||
|
"status": "completed",
|
||||||
|
"result": {
|
||||||
|
"markdown": "# Page Title\n\nContent...",
|
||||||
|
"extracted_content": {...},
|
||||||
|
"links": {...}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Webhook Configuration
|
||||||
|
|
||||||
|
Webhooks provide real-time notifications when your jobs complete, eliminating the need for constant polling.
|
||||||
|
|
||||||
|
#### Webhook Config Parameters
|
||||||
|
|
||||||
|
| Parameter | Type | Required | Description |
|
||||||
|
|-----------|------|----------|-------------|
|
||||||
|
| `webhook_url` | string | Yes | Your HTTP(S) endpoint to receive notifications |
|
||||||
|
| `webhook_data_in_payload` | boolean | No | Include full result data in webhook payload (default: false) |
|
||||||
|
| `webhook_headers` | object | No | Custom headers for authentication/identification |
|
||||||
|
|
||||||
|
#### Webhook Payload Format
|
||||||
|
|
||||||
|
**Success Notification (Crawl Job):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_1698765432",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": {
|
||||||
|
"markdown": "# Page content...",
|
||||||
|
"extracted_content": {...},
|
||||||
|
"links": {...}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Success Notification (LLM Job):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432",
|
||||||
|
"task_type": "llm_extraction",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com/article"],
|
||||||
|
"data": {
|
||||||
|
"extracted_content": {
|
||||||
|
"title": "Understanding Web Scraping",
|
||||||
|
"author": "John Doe",
|
||||||
|
"date": "2025-10-22",
|
||||||
|
"points": ["Point 1", "Point 2"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Failure Notification:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_1698765432",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "failed",
|
||||||
|
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"error": "Connection timeout after 30 seconds"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Webhook Delivery & Retry
|
||||||
|
|
||||||
|
- **Delivery Method:** HTTP POST to your `webhook_url`
|
||||||
|
- **Content-Type:** `application/json`
|
||||||
|
- **Retry Policy:** Exponential backoff with 5 attempts
|
||||||
|
- Attempt 1: Immediate
|
||||||
|
- Attempt 2: 1 second delay
|
||||||
|
- Attempt 3: 2 seconds delay
|
||||||
|
- Attempt 4: 4 seconds delay
|
||||||
|
- Attempt 5: 8 seconds delay
|
||||||
|
- **Success Status Codes:** 200-299
|
||||||
|
- **Custom Headers:** Your `webhook_headers` are included in every request
|
||||||
|
|
||||||
|
### Usage Examples
|
||||||
|
|
||||||
|
#### Example 1: Python with Webhook Handler (Flask)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
import requests
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
# Webhook handler
|
||||||
|
@app.route('/webhook/crawl-complete', methods=['POST'])
|
||||||
|
def handle_crawl_webhook():
|
||||||
|
payload = request.json
|
||||||
|
|
||||||
|
if payload['status'] == 'completed':
|
||||||
|
print(f"✅ Job {payload['task_id']} completed!")
|
||||||
|
print(f"Task type: {payload['task_type']}")
|
||||||
|
|
||||||
|
# Access the crawl results
|
||||||
|
if 'data' in payload:
|
||||||
|
markdown = payload['data'].get('markdown', '')
|
||||||
|
extracted = payload['data'].get('extracted_content', {})
|
||||||
|
print(f"Extracted {len(markdown)} characters")
|
||||||
|
print(f"Structured data: {extracted}")
|
||||||
|
else:
|
||||||
|
print(f"❌ Job {payload['task_id']} failed: {payload.get('error')}")
|
||||||
|
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
# Submit a crawl job with webhook
|
||||||
|
def submit_crawl_job():
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/crawl/job",
|
||||||
|
json={
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"extraction_strategy": {
|
||||||
|
"type": "JsonCssExtractionStrategy",
|
||||||
|
"schema": {
|
||||||
|
"name": "Example Schema",
|
||||||
|
"baseSelector": "body",
|
||||||
|
"fields": [
|
||||||
|
{"name": "title", "selector": "h1", "type": "text"},
|
||||||
|
{"name": "description", "selector": "meta[name='description']", "type": "attribute", "attribute": "content"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://your-app.com/webhook/crawl-complete",
|
||||||
|
"webhook_data_in_payload": True,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"Job submitted: {task_id}")
|
||||||
|
return task_id
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app.run(port=5000)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Example 2: LLM Extraction with Webhooks
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
from flask import request, jsonify  # assumes the Flask `app` from Example 1 is in scope
|
||||||
|
|
||||||
|
def submit_llm_job_with_webhook():
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/llm/job",
|
||||||
|
json={
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"q": "Extract the article title, author, and main points",
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://your-app.com/webhook/llm-complete",
|
||||||
|
"webhook_data_in_payload": True,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"LLM job submitted: {task_id}")
|
||||||
|
return task_id
|
||||||
|
|
||||||
|
# Webhook handler for LLM jobs
|
||||||
|
@app.route('/webhook/llm-complete', methods=['POST'])
|
||||||
|
def handle_llm_webhook():
|
||||||
|
payload = request.json
|
||||||
|
|
||||||
|
if payload['status'] == 'completed':
|
||||||
|
extracted = payload['data']['extracted_content']
|
||||||
|
print(f"✅ LLM extraction completed!")
|
||||||
|
print(f"Results: {extracted}")
|
||||||
|
else:
|
||||||
|
print(f"❌ LLM extraction failed: {payload.get('error')}")
|
||||||
|
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Example 3: Without Webhooks (Polling)
|
||||||
|
|
||||||
|
If you don't use webhooks, you can poll for results:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Submit job
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/crawl/job",
|
||||||
|
json={"urls": ["https://example.com"]}
|
||||||
|
)
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
|
||||||
|
# Poll for results
|
||||||
|
while True:
|
||||||
|
result = requests.get(f"http://localhost:11235/job/{task_id}")
|
||||||
|
data = result.json()
|
||||||
|
|
||||||
|
if data['status'] == 'completed':
|
||||||
|
print("Job completed!")
|
||||||
|
print(data['result'])
|
||||||
|
break
|
||||||
|
elif data['status'] == 'failed':
|
||||||
|
print(f"Job failed: {data.get('error')}")
|
||||||
|
break
|
||||||
|
|
||||||
|
print("Still processing...")
|
||||||
|
time.sleep(2)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Example 4: Global Webhook Configuration
|
||||||
|
|
||||||
|
Set a default webhook URL in your `config.yml` to avoid repeating it in every request:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# config.yml
|
||||||
|
api:
|
||||||
|
crawler:
|
||||||
|
# ... other settings ...
|
||||||
|
webhook:
|
||||||
|
default_url: "https://your-app.com/webhook/default"
|
||||||
|
default_headers:
|
||||||
|
X-Webhook-Secret: "your-secret-token"
|
||||||
|
```
|
||||||
|
|
||||||
|
Then submit jobs without webhook config:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Uses the global webhook configuration
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/crawl/job",
|
||||||
|
json={"urls": ["https://example.com"]}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Webhook Best Practices
|
||||||
|
|
||||||
|
1. **Authentication:** Always use custom headers for webhook authentication
|
||||||
|
```json
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Idempotency:** Design your webhook handler to be idempotent (safe to receive duplicate notifications)
|
||||||
|
|
||||||
|
3. **Fast Response:** Return HTTP 200 quickly; process data asynchronously if needed
|
||||||
|
```python
|
||||||
|
@app.route('/webhook', methods=['POST'])
|
||||||
|
def webhook():
|
||||||
|
payload = request.json
|
||||||
|
# Queue for background processing
|
||||||
|
queue.enqueue(process_webhook, payload)
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Error Handling:** Handle both success and failure notifications
|
||||||
|
```python
|
||||||
|
if payload['status'] == 'completed':
|
||||||
|
# Process success
|
||||||
|
elif payload['status'] == 'failed':
|
||||||
|
# Log error, retry, or alert
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Validation:** Verify webhook authenticity using custom headers
|
||||||
|
```python
|
||||||
|
secret = request.headers.get('X-Webhook-Secret')
|
||||||
|
if secret != os.environ['EXPECTED_SECRET']:
|
||||||
|
return jsonify({"error": "Unauthorized"}), 401
|
||||||
|
```
|
||||||
|
|
||||||
|
6. **Logging:** Log webhook deliveries for debugging
|
||||||
|
```python
|
||||||
|
logger.info(f"Webhook received: {payload['task_id']} - {payload['status']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Use Cases
|
||||||
|
|
||||||
|
**1. Batch Processing**
|
||||||
|
Submit hundreds of URLs and get notified as each completes:
|
||||||
|
```python
|
||||||
|
urls = ["https://site1.com", "https://site2.com", ...]
|
||||||
|
for url in urls:
|
||||||
|
submit_crawl_job(url, webhook_url="https://app.com/webhook")
|
||||||
|
```
|
||||||
|
|
||||||
|
**2. Microservice Integration**
|
||||||
|
Integrate with event-driven architectures:
|
||||||
|
```python
|
||||||
|
# Service A submits job
|
||||||
|
task_id = submit_crawl_job(url)
|
||||||
|
|
||||||
|
# Service B receives webhook and triggers next step
|
||||||
|
@app.route('/webhook')
|
||||||
|
def webhook():
|
||||||
|
process_result(request.json)
|
||||||
|
trigger_next_service()
|
||||||
|
return "OK", 200
|
||||||
|
```
|
||||||
|
|
||||||
|
**3. Long-Running Extractions**
|
||||||
|
Handle complex LLM extractions without timeouts:
|
||||||
|
```python
|
||||||
|
submit_llm_job(
|
||||||
|
url="https://long-article.com",
|
||||||
|
q="Comprehensive summary with key points and analysis",
|
||||||
|
webhook_url="https://app.com/webhook/llm"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Troubleshooting
|
||||||
|
|
||||||
|
**Webhook not receiving notifications?**
|
||||||
|
- Check your webhook URL is publicly accessible
|
||||||
|
- Verify firewall/security group settings
|
||||||
|
- Use webhook testing tools like webhook.site for debugging
|
||||||
|
- Check server logs for delivery attempts
|
||||||
|
- Ensure your handler returns 200-299 status code
|
||||||
|
|
||||||
|
**Job stuck in processing?**
|
||||||
|
- Check Redis connection: `docker logs <container_name> | grep redis`
|
||||||
|
- Verify worker processes: `docker exec <container_name> ps aux | grep worker`
|
||||||
|
- Check server logs: `docker logs <container_name>`
|
||||||
|
|
||||||
|
**Need to cancel a job?**
|
||||||
|
Jobs are processed asynchronously. If you need to cancel:
|
||||||
|
- Delete the task from Redis (requires Redis CLI access)
|
||||||
|
- Or implement a cancellation endpoint in your webhook handler
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Dockerfile Parameters
|
## Dockerfile Parameters
|
||||||
@@ -892,10 +1634,12 @@ This is the easiest way to translate Python configuration to JSON requests when
|
|||||||
|
|
||||||
Install the SDK: `pip install crawl4ai`
|
Install the SDK: `pip install crawl4ai`
|
||||||
|
|
||||||
|
The Python SDK provides a convenient way to interact with the Docker API, including **automatic hook conversion** when using function objects.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Point to the correct server port
|
# Point to the correct server port
|
||||||
@@ -907,23 +1651,22 @@ async def main():
|
|||||||
print("--- Running Non-Streaming Crawl ---")
|
print("--- Running Non-Streaming Crawl ---")
|
||||||
results = await client.crawl(
|
results = await client.crawl(
|
||||||
["https://httpbin.org/html"],
|
["https://httpbin.org/html"],
|
||||||
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
|
browser_config=BrowserConfig(headless=True),
|
||||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
)
|
)
|
||||||
if results: # client.crawl returns None on failure
|
if results:
|
||||||
print(f"Non-streaming results success: {results.success}")
|
print(f"Non-streaming results success: {results.success}")
|
||||||
if results.success:
|
if results.success:
|
||||||
for result in results: # Iterate through the CrawlResultContainer
|
for result in results:
|
||||||
print(f"URL: {result.url}, Success: {result.success}")
|
print(f"URL: {result.url}, Success: {result.success}")
|
||||||
else:
|
else:
|
||||||
print("Non-streaming crawl failed.")
|
print("Non-streaming crawl failed.")
|
||||||
|
|
||||||
|
|
||||||
# Example Streaming crawl
|
# Example Streaming crawl
|
||||||
print("\n--- Running Streaming Crawl ---")
|
print("\n--- Running Streaming Crawl ---")
|
||||||
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
||||||
try:
|
try:
|
||||||
async for result in await client.crawl( # client.crawl returns an async generator for streaming
|
async for result in await client.crawl(
|
||||||
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
||||||
browser_config=BrowserConfig(headless=True),
|
browser_config=BrowserConfig(headless=True),
|
||||||
crawler_config=stream_config
|
crawler_config=stream_config
|
||||||
@@ -932,17 +1675,56 @@ async def main():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Streaming crawl failed: {e}")
|
print(f"Streaming crawl failed: {e}")
|
||||||
|
|
||||||
|
# Example with hooks (Python function objects)
|
||||||
|
print("\n--- Crawl with Hooks ---")
|
||||||
|
|
||||||
|
async def my_hook(page, context, **kwargs):
|
||||||
|
"""Custom hook to optimize performance"""
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
await context.route("**/*.{png,jpg}", lambda r: r.abort())
|
||||||
|
print("[HOOK] Page optimized")
|
||||||
|
return page
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
browser_config=BrowserConfig(headless=True),
|
||||||
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
|
||||||
|
hooks={"on_page_context_created": my_hook}, # Pass function directly!
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
print(f"Crawl with hooks success: {result.success}")
|
||||||
|
|
||||||
# Example Get schema
|
# Example Get schema
|
||||||
print("\n--- Getting Schema ---")
|
print("\n--- Getting Schema ---")
|
||||||
schema = await client.get_schema()
|
schema = await client.get_schema()
|
||||||
print(f"Schema received: {bool(schema)}") # Print whether schema was received
|
print(f"Schema received: {bool(schema)}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
|
#### SDK Parameters
|
||||||
|
|
||||||
|
The Docker client supports the following parameters:
|
||||||
|
|
||||||
|
**Client Initialization**:
|
||||||
|
- `base_url` (str): URL of the Docker server (default: `http://localhost:8000`)
|
||||||
|
- `timeout` (float): Request timeout in seconds (default: 30.0)
|
||||||
|
- `verify_ssl` (bool): Verify SSL certificates (default: True)
|
||||||
|
- `verbose` (bool): Enable verbose logging (default: True)
|
||||||
|
- `log_file` (Optional[str]): Path to log file (default: None)
|
||||||
|
|
||||||
|
**crawl() Method**:
|
||||||
|
- `urls` (List[str]): List of URLs to crawl
|
||||||
|
- `browser_config` (Optional[BrowserConfig]): Browser configuration
|
||||||
|
- `crawler_config` (Optional[CrawlerRunConfig]): Crawler configuration
|
||||||
|
- `hooks` (Optional[Dict]): Hook functions or strings - **automatically converts function objects!**
|
||||||
|
- `hooks_timeout` (int): Timeout for each hook execution in seconds (default: 30)
|
||||||
|
|
||||||
|
**Returns**:
|
||||||
|
- Single URL: `CrawlResult` object
|
||||||
|
- Multiple URLs: `List[CrawlResult]`
|
||||||
|
- Streaming: `AsyncGenerator[CrawlResult]`
|
||||||
|
|
||||||
### Second Approach: Direct API Calls
|
### Second Approach: Direct API Calls
|
||||||
|
|
||||||
@@ -1352,19 +2134,40 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
|||||||
|
|
||||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||||
- Building and running the Docker container
|
- Building and running the Docker container
|
||||||
- Configuring the environment
|
- Configuring the environment
|
||||||
- Using the interactive playground for testing
|
- Using the interactive playground for testing
|
||||||
- Making API requests with proper typing
|
- Making API requests with proper typing
|
||||||
- Using the Python SDK
|
- Using the Python SDK with **automatic hook conversion**
|
||||||
|
- **Working with hooks** - both string-based (REST API) and function-based (Python SDK)
|
||||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||||
- Connecting via the Model Context Protocol (MCP)
|
- Connecting via the Model Context Protocol (MCP)
|
||||||
- Monitoring your deployment
|
- Monitoring your deployment
|
||||||
|
|
||||||
The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
|
### Key Features
|
||||||
|
|
||||||
For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
**Hooks Support**: Crawl4AI offers two approaches for working with hooks:
|
||||||
|
- **String-based** (REST API): Works with any language, requires manual string formatting
|
||||||
|
- **Function-based** (Python SDK): Write hooks as regular Python functions with full IDE support and automatic conversion
|
||||||
|
|
||||||
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
|
**Playground Interface**: The built-in playground at `http://localhost:11235/playground` makes it easy to test configurations and generate corresponding JSON for API requests.
|
||||||
|
|
||||||
|
**MCP Integration**: For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
||||||
|
|
||||||
|
### Next Steps
|
||||||
|
|
||||||
|
1. **Explore Examples**: Check out the comprehensive examples in:
|
||||||
|
- `/docs/examples/hooks_docker_client_example.py` - Python function-based hooks
|
||||||
|
- `/docs/examples/hooks_rest_api_example.py` - REST API string-based hooks
|
||||||
|
- `/docs/examples/README_HOOKS.md` - Comparison and guide
|
||||||
|
|
||||||
|
2. **Read Documentation**:
|
||||||
|
- `/docs/hooks-utility-guide.md` - Complete hooks utility guide
|
||||||
|
- API documentation for detailed configuration options
|
||||||
|
|
||||||
|
3. **Join the Community**:
|
||||||
|
- GitHub: Report issues and contribute
|
||||||
|
- Discord: Get help and share your experiences
|
||||||
|
- Documentation: Comprehensive guides and tutorials
|
||||||
|
|
||||||
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
||||||
|
|
||||||
|
|||||||
@@ -59,6 +59,27 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
|||||||
|
|
||||||
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
||||||
|
|
||||||
|
## 🆕 AI Assistant Skill Now Available!
|
||||||
|
|
||||||
|
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; margin: 20px 0; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
|
||||||
|
<h3 style="color: white; margin: 0 0 10px 0;">🤖 Crawl4AI Skill for Claude & AI Assistants</h3>
|
||||||
|
<p style="color: white; margin: 10px 0;">Supercharge your AI coding assistant with complete Crawl4AI knowledge! Download our comprehensive skill package that includes:</p>
|
||||||
|
<ul style="color: white; margin: 10px 0;">
|
||||||
|
<li>📚 Complete SDK reference (23K+ words)</li>
|
||||||
|
<li>🚀 Ready-to-use extraction scripts</li>
|
||||||
|
<li>⚡ Schema generation for efficient scraping</li>
|
||||||
|
<li>🔧 Version 0.7.4 compatible</li>
|
||||||
|
</ul>
|
||||||
|
<div style="text-align: center; margin-top: 15px;">
|
||||||
|
<a href="assets/crawl4ai-skill.zip" download style="background: white; color: #667eea; padding: 12px 30px; border-radius: 5px; text-decoration: none; font-weight: bold; display: inline-block; transition: transform 0.2s;">
|
||||||
|
📦 Download Skill Package
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<p style="color: white; margin: 15px 0 0 0; font-size: 0.9em; text-align: center;">
|
||||||
|
Works with Claude, Cursor, Windsurf, and other AI coding assistants. Import the .zip file into your AI assistant's skill/knowledge system.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
## 🎯 New: Adaptive Web Crawling
|
## 🎯 New: Adaptive Web Crawling
|
||||||
|
|
||||||
Crawl4AI now features intelligent adaptive crawling that knows when to stop! Using advanced information foraging algorithms, it determines when sufficient information has been gathered to answer your query.
|
Crawl4AI now features intelligent adaptive crawling that knows when to stop! Using advanced information foraging algorithms, it determines when sufficient information has been gathered to answer your query.
|
||||||
|
|||||||
@@ -529,8 +529,19 @@ class AdminDashboard {
|
|||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group full-width">
|
<div class="form-group full-width">
|
||||||
<label>Integration Guide</label>
|
<label>Long Description (Markdown - Overview tab)</label>
|
||||||
<textarea id="form-integration" rows="10">${app?.integration_guide || ''}</textarea>
|
<textarea id="form-long-description" rows="10" placeholder="Enter detailed description with markdown formatting...">${app?.long_description || ''}</textarea>
|
||||||
|
<small>Markdown support: **bold**, *italic*, [links](url), # headers, code blocks, lists</small>
|
||||||
|
</div>
|
||||||
|
<div class="form-group full-width">
|
||||||
|
<label>Integration Guide (Markdown - Integration tab)</label>
|
||||||
|
<textarea id="form-integration" rows="20" placeholder="Enter integration guide with installation, examples, and code snippets using markdown...">${app?.integration_guide || ''}</textarea>
|
||||||
|
<small>Single markdown field with installation, examples, and complete guide. Code blocks get auto copy buttons.</small>
|
||||||
|
</div>
|
||||||
|
<div class="form-group full-width">
|
||||||
|
<label>Documentation (Markdown - Documentation tab)</label>
|
||||||
|
<textarea id="form-documentation" rows="20" placeholder="Enter documentation with API reference, examples, and best practices using markdown...">${app?.documentation || ''}</textarea>
|
||||||
|
<small>Full documentation with API reference, examples, best practices, etc.</small>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
@@ -712,7 +723,9 @@ class AdminDashboard {
|
|||||||
data.contact_email = document.getElementById('form-email').value;
|
data.contact_email = document.getElementById('form-email').value;
|
||||||
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
|
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
|
||||||
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
|
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
|
||||||
|
data.long_description = document.getElementById('form-long-description').value;
|
||||||
data.integration_guide = document.getElementById('form-integration').value;
|
data.integration_guide = document.getElementById('form-integration').value;
|
||||||
|
data.documentation = document.getElementById('form-documentation').value;
|
||||||
} else if (type === 'articles') {
|
} else if (type === 'articles') {
|
||||||
data.title = document.getElementById('form-title').value;
|
data.title = document.getElementById('form-title').value;
|
||||||
data.slug = this.generateSlug(data.title);
|
data.slug = this.generateSlug(data.title);
|
||||||
|
|||||||
@@ -510,6 +510,31 @@
|
|||||||
line-height: 1.5;
|
line-height: 1.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Markdown rendered code blocks */
|
||||||
|
.integration-content pre,
|
||||||
|
.docs-content pre {
|
||||||
|
background: var(--bg-dark);
|
||||||
|
border: 1px solid var(--border-color);
|
||||||
|
margin: 1rem 0;
|
||||||
|
padding: 1rem;
|
||||||
|
padding-top: 2.5rem; /* Space for copy button */
|
||||||
|
overflow-x: auto;
|
||||||
|
position: relative;
|
||||||
|
max-height: none; /* Remove any height restrictions */
|
||||||
|
height: auto; /* Allow content to expand */
|
||||||
|
}
|
||||||
|
|
||||||
|
.integration-content pre code,
|
||||||
|
.docs-content pre code {
|
||||||
|
background: transparent;
|
||||||
|
padding: 0;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
font-size: 0.875rem;
|
||||||
|
line-height: 1.5;
|
||||||
|
white-space: pre; /* Preserve whitespace and line breaks */
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
/* Feature Grid */
|
/* Feature Grid */
|
||||||
.feature-grid {
|
.feature-grid {
|
||||||
display: grid;
|
display: grid;
|
||||||
|
|||||||
@@ -80,20 +80,7 @@
|
|||||||
<section id="overview-tab" class="tab-content active">
|
<section id="overview-tab" class="tab-content active">
|
||||||
<div class="overview-columns">
|
<div class="overview-columns">
|
||||||
<div class="overview-main">
|
<div class="overview-main">
|
||||||
<h2>Overview</h2>
|
|
||||||
<div id="app-overview">Overview content goes here.</div>
|
<div id="app-overview">Overview content goes here.</div>
|
||||||
|
|
||||||
<h3>Key Features</h3>
|
|
||||||
<ul id="app-features" class="features-list">
|
|
||||||
<li>Feature 1</li>
|
|
||||||
<li>Feature 2</li>
|
|
||||||
<li>Feature 3</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h3>Use Cases</h3>
|
|
||||||
<div id="app-use-cases" class="use-cases">
|
|
||||||
<p>Describe how this app can help your workflow.</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<aside class="sidebar">
|
<aside class="sidebar">
|
||||||
@@ -142,33 +129,14 @@
|
|||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section id="integration-tab" class="tab-content">
|
<section id="integration-tab" class="tab-content">
|
||||||
<div class="integration-content">
|
<div class="integration-content" id="app-integration">
|
||||||
<h2>Integration Guide</h2>
|
<!-- Integration guide markdown content will be rendered here -->
|
||||||
|
|
||||||
<h3>Installation</h3>
|
|
||||||
<div class="code-block">
|
|
||||||
<pre><code id="install-code"># Installation instructions will appear here</code></pre>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<h3>Basic Usage</h3>
|
|
||||||
<div class="code-block">
|
|
||||||
<pre><code id="usage-code"># Usage example will appear here</code></pre>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<h3>Complete Integration Example</h3>
|
|
||||||
<div class="code-block">
|
|
||||||
<button class="copy-btn" id="copy-integration">Copy</button>
|
|
||||||
<pre><code id="integration-code"># Complete integration guide will appear here</code></pre>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section id="docs-tab" class="tab-content">
|
<section id="docs-tab" class="tab-content">
|
||||||
<div class="docs-content">
|
<div class="docs-content" id="app-docs">
|
||||||
<h2>Documentation</h2>
|
<!-- Documentation markdown content will be rendered here -->
|
||||||
<div id="app-docs" class="doc-sections">
|
|
||||||
<p>Documentation coming soon.</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
|||||||
@@ -123,144 +123,132 @@ class AppDetailPage {
|
|||||||
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
|
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
|
||||||
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
|
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
|
||||||
|
|
||||||
// Integration guide
|
// Render tab contents from database fields
|
||||||
this.renderIntegrationGuide();
|
this.renderTabContents();
|
||||||
}
|
}
|
||||||
|
|
||||||
renderIntegrationGuide() {
|
renderTabContents() {
|
||||||
// Installation code
|
// Overview tab - use long_description from database
|
||||||
const installCode = document.getElementById('install-code');
|
const overviewDiv = document.getElementById('app-overview');
|
||||||
if (installCode) {
|
if (overviewDiv) {
|
||||||
if (this.appData.type === 'Open Source' && this.appData.github_url) {
|
if (this.appData.long_description) {
|
||||||
installCode.textContent = `# Clone from GitHub
|
overviewDiv.innerHTML = this.renderMarkdown(this.appData.long_description);
|
||||||
git clone ${this.appData.github_url}
|
} else {
|
||||||
|
overviewDiv.innerHTML = `<p>${this.appData.description || 'No overview available.'}</p>`;
|
||||||
# Install dependencies
|
|
||||||
pip install -r requirements.txt`;
|
|
||||||
} else if (this.appData.name.toLowerCase().includes('api')) {
|
|
||||||
installCode.textContent = `# Install via pip
|
|
||||||
pip install ${this.appData.slug}
|
|
||||||
|
|
||||||
# Or install from source
|
|
||||||
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Usage code - customize based on category
|
// Integration tab - use integration_guide field from database
|
||||||
const usageCode = document.getElementById('usage-code');
|
const integrationDiv = document.getElementById('app-integration');
|
||||||
if (usageCode) {
|
if (integrationDiv) {
|
||||||
if (this.appData.category === 'Browser Automation') {
|
if (this.appData.integration_guide) {
|
||||||
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
integrationDiv.innerHTML = this.renderMarkdown(this.appData.integration_guide);
|
||||||
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
|
// Add copy buttons to all code blocks
|
||||||
|
this.addCopyButtonsToCodeBlocks(integrationDiv);
|
||||||
async def main():
|
} else {
|
||||||
# Initialize ${this.appData.name}
|
integrationDiv.innerHTML = '<p>Integration guide not yet available. Please check the official website for details.</p>';
|
||||||
automation = ${this.appData.name.replace(/\s+/g, '')}()
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
url="https://example.com",
|
|
||||||
browser_config=automation.config,
|
|
||||||
wait_for="css:body"
|
|
||||||
)
|
|
||||||
print(result.markdown)`;
|
|
||||||
} else if (this.appData.category === 'Proxy Services') {
|
|
||||||
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
|
||||||
import ${this.appData.slug.replace(/-/g, '_')}
|
|
||||||
|
|
||||||
# Configure proxy
|
|
||||||
proxy_config = {
|
|
||||||
"server": "${this.appData.website_url || 'https://proxy.example.com'}",
|
|
||||||
"username": "your_username",
|
|
||||||
"password": "your_password"
|
|
||||||
}
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(proxy=proxy_config) as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
url="https://example.com",
|
|
||||||
bypass_cache=True
|
|
||||||
)
|
|
||||||
print(result.status_code)`;
|
|
||||||
} else if (this.appData.category === 'LLM Integration') {
|
|
||||||
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
|
||||||
|
|
||||||
# Configure LLM extraction
|
|
||||||
strategy = LLMExtractionStrategy(
|
|
||||||
provider="${this.appData.name.toLowerCase().includes('gpt') ? 'openai' : 'anthropic'}",
|
|
||||||
api_key="your-api-key",
|
|
||||||
model="${this.appData.name.toLowerCase().includes('gpt') ? 'gpt-4' : 'claude-3'}",
|
|
||||||
instruction="Extract structured data"
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
url="https://example.com",
|
|
||||||
extraction_strategy=strategy
|
|
||||||
)
|
|
||||||
print(result.extracted_content)`;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Integration example
|
// Documentation tab - use documentation field from database
|
||||||
const integrationCode = document.getElementById('integration-code');
|
const docsDiv = document.getElementById('app-docs');
|
||||||
if (integrationCode) {
|
if (docsDiv) {
|
||||||
integrationCode.textContent = this.appData.integration_guide ||
|
if (this.appData.documentation) {
|
||||||
`# Complete ${this.appData.name} Integration Example
|
docsDiv.innerHTML = this.renderMarkdown(this.appData.documentation);
|
||||||
|
// Add copy buttons to all code blocks
|
||||||
from crawl4ai import AsyncWebCrawler
|
this.addCopyButtonsToCodeBlocks(docsDiv);
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
} else {
|
||||||
import json
|
docsDiv.innerHTML = '<p>Documentation coming soon.</p>';
|
||||||
|
}
|
||||||
async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
|
}
|
||||||
"""
|
|
||||||
Complete example showing how to use ${this.appData.name}
|
|
||||||
with Crawl4AI for production web scraping
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Define extraction schema
|
|
||||||
schema = {
|
|
||||||
"name": "ProductList",
|
|
||||||
"baseSelector": "div.product",
|
|
||||||
"fields": [
|
|
||||||
{"name": "title", "selector": "h2", "type": "text"},
|
|
||||||
{"name": "price", "selector": ".price", "type": "text"},
|
|
||||||
{"name": "image", "selector": "img", "type": "attribute", "attribute": "src"},
|
|
||||||
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Initialize crawler with ${this.appData.name}
|
addCopyButtonsToCodeBlocks(container) {
|
||||||
async with AsyncWebCrawler(
|
// Find all code blocks and add copy buttons
|
||||||
browser_type="chromium",
|
const codeBlocks = container.querySelectorAll('pre code');
|
||||||
headless=True,
|
codeBlocks.forEach(codeBlock => {
|
||||||
verbose=True
|
const pre = codeBlock.parentElement;
|
||||||
) as crawler:
|
|
||||||
|
|
||||||
# Crawl with extraction
|
// Skip if already has a copy button
|
||||||
result = await crawler.arun(
|
if (pre.querySelector('.copy-btn')) return;
|
||||||
url="https://example.com/products",
|
|
||||||
extraction_strategy=JsonCssExtractionStrategy(schema),
|
|
||||||
cache_mode="bypass",
|
|
||||||
wait_for="css:.product",
|
|
||||||
screenshot=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process results
|
// Create copy button
|
||||||
if result.success:
|
const copyBtn = document.createElement('button');
|
||||||
products = json.loads(result.extracted_content)
|
copyBtn.className = 'copy-btn';
|
||||||
print(f"Found {len(products)} products")
|
copyBtn.textContent = 'Copy';
|
||||||
|
copyBtn.onclick = () => {
|
||||||
|
navigator.clipboard.writeText(codeBlock.textContent).then(() => {
|
||||||
|
copyBtn.textContent = '✓ Copied!';
|
||||||
|
setTimeout(() => {
|
||||||
|
copyBtn.textContent = 'Copy';
|
||||||
|
}, 2000);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
for product in products[:5]:
|
// Add button to pre element
|
||||||
print(f"- {product['title']}: {product['price']}")
|
pre.style.position = 'relative';
|
||||||
|
pre.insertBefore(copyBtn, codeBlock);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
return products
|
renderMarkdown(text) {
|
||||||
|
if (!text) return '';
|
||||||
|
|
||||||
# Run the crawler
|
// Store code blocks temporarily to protect them from processing
|
||||||
if __name__ == "__main__":
|
const codeBlocks = [];
|
||||||
import asyncio
|
let processed = text.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => {
|
||||||
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
|
const placeholder = `___CODE_BLOCK_${codeBlocks.length}___`;
|
||||||
}
|
codeBlocks.push(`<pre><code class="language-${lang || ''}">${this.escapeHtml(code)}</code></pre>`);
|
||||||
|
return placeholder;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Store inline code temporarily
|
||||||
|
const inlineCodes = [];
|
||||||
|
processed = processed.replace(/`([^`]+)`/g, (match, code) => {
|
||||||
|
const placeholder = `___INLINE_CODE_${inlineCodes.length}___`;
|
||||||
|
inlineCodes.push(`<code>${this.escapeHtml(code)}</code>`);
|
||||||
|
return placeholder;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Now process the rest of the markdown
|
||||||
|
processed = processed
|
||||||
|
// Headers
|
||||||
|
.replace(/^### (.*$)/gim, '<h3>$1</h3>')
|
||||||
|
.replace(/^## (.*$)/gim, '<h2>$1</h2>')
|
||||||
|
.replace(/^# (.*$)/gim, '<h1>$1</h1>')
|
||||||
|
// Bold
|
||||||
|
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
|
||||||
|
// Italic
|
||||||
|
.replace(/\*(.*?)\*/g, '<em>$1</em>')
|
||||||
|
// Links
|
||||||
|
.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2" target="_blank">$1</a>')
|
||||||
|
// Line breaks
|
||||||
|
.replace(/\n\n/g, '</p><p>')
|
||||||
|
.replace(/\n/g, '<br>')
|
||||||
|
// Lists
|
||||||
|
.replace(/^\* (.*)$/gim, '<li>$1</li>')
|
||||||
|
.replace(/^- (.*)$/gim, '<li>$1</li>')
|
||||||
|
// Wrap in paragraphs
|
||||||
|
.replace(/^(?!<[h|p|pre|ul|ol|li])/gim, '<p>')
|
||||||
|
.replace(/(?<![>])$/gim, '</p>');
|
||||||
|
|
||||||
|
// Restore inline code
|
||||||
|
inlineCodes.forEach((code, i) => {
|
||||||
|
processed = processed.replace(`___INLINE_CODE_${i}___`, code);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Restore code blocks
|
||||||
|
codeBlocks.forEach((block, i) => {
|
||||||
|
processed = processed.replace(`___CODE_BLOCK_${i}___`, block);
|
||||||
|
});
|
||||||
|
|
||||||
|
return processed;
|
||||||
|
}
|
||||||
|
|
||||||
|
escapeHtml(text) {
|
||||||
|
const div = document.createElement('div');
|
||||||
|
div.textContent = text;
|
||||||
|
return div.innerHTML;
|
||||||
}
|
}
|
||||||
|
|
||||||
formatNumber(num) {
|
formatNumber(num) {
|
||||||
@@ -289,33 +277,6 @@ if __name__ == "__main__":
|
|||||||
document.getElementById(`${tabName}-tab`).classList.add('active');
|
document.getElementById(`${tabName}-tab`).classList.add('active');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// Copy integration code
|
|
||||||
document.getElementById('copy-integration').addEventListener('click', () => {
|
|
||||||
const code = document.getElementById('integration-code').textContent;
|
|
||||||
navigator.clipboard.writeText(code).then(() => {
|
|
||||||
const btn = document.getElementById('copy-integration');
|
|
||||||
const originalText = btn.innerHTML;
|
|
||||||
btn.innerHTML = '<span>✓</span> Copied!';
|
|
||||||
setTimeout(() => {
|
|
||||||
btn.innerHTML = originalText;
|
|
||||||
}, 2000);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// Copy code buttons
|
|
||||||
document.querySelectorAll('.copy-btn').forEach(btn => {
|
|
||||||
btn.addEventListener('click', (e) => {
|
|
||||||
const codeBlock = e.target.closest('.code-block');
|
|
||||||
const code = codeBlock.querySelector('code').textContent;
|
|
||||||
navigator.clipboard.writeText(code).then(() => {
|
|
||||||
btn.textContent = 'Copied!';
|
|
||||||
setTimeout(() => {
|
|
||||||
btn.textContent = 'Copy';
|
|
||||||
}, 2000);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async loadRelatedApps() {
|
async loadRelatedApps() {
|
||||||
|
|||||||
338
docs/releases_review/demo_v0.7.5.py
Normal file
338
docs/releases_review/demo_v0.7.5.py
Normal file
@@ -0,0 +1,338 @@
|
|||||||
|
"""
|
||||||
|
🚀 Crawl4AI v0.7.5 Release Demo - Working Examples
|
||||||
|
==================================================
|
||||||
|
This demo showcases key features introduced in v0.7.5 with real, executable examples.
|
||||||
|
|
||||||
|
Featured Demos:
|
||||||
|
1. ✅ Docker Hooks System - Real API calls with custom hooks (string & function-based)
|
||||||
|
2. ✅ Enhanced LLM Integration - Working LLM configurations
|
||||||
|
3. ✅ HTTPS Preservation - Live crawling with HTTPS maintenance
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- crawl4ai v0.7.5 installed
|
||||||
|
- Docker running with crawl4ai image (optional for Docker demos)
|
||||||
|
- Valid API keys for LLM demos (optional)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from crawl4ai import (AsyncWebCrawler, CrawlerRunConfig, BrowserConfig,
|
||||||
|
CacheMode, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy,
|
||||||
|
hooks_to_string)
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
|
||||||
|
def print_section(title: str, description: str = ""):
|
||||||
|
"""Print a section header"""
|
||||||
|
print(f"\n{'=' * 60}")
|
||||||
|
print(f"{title}")
|
||||||
|
if description:
|
||||||
|
print(f"{description}")
|
||||||
|
print(f"{'=' * 60}\n")
|
||||||
|
|
||||||
|
|
||||||
|
async def demo_1_docker_hooks_system():
|
||||||
|
"""Demo 1: Docker Hooks System - Real API calls with custom hooks"""
|
||||||
|
print_section(
|
||||||
|
"Demo 1: Docker Hooks System",
|
||||||
|
"Testing both string-based and function-based hooks (NEW in v0.7.5!)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check Docker service availability
|
||||||
|
def check_docker_service():
|
||||||
|
try:
|
||||||
|
response = requests.get("http://localhost:11235/", timeout=3)
|
||||||
|
return response.status_code == 200
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("Checking Docker service...")
|
||||||
|
docker_running = check_docker_service()
|
||||||
|
|
||||||
|
if not docker_running:
|
||||||
|
print("⚠️ Docker service not running on localhost:11235")
|
||||||
|
print("To test Docker hooks:")
|
||||||
|
print("1. Run: docker run -p 11235:11235 unclecode/crawl4ai:latest")
|
||||||
|
print("2. Wait for service to start")
|
||||||
|
print("3. Re-run this demo\n")
|
||||||
|
return
|
||||||
|
|
||||||
|
print("✓ Docker service detected!")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PART 1: Traditional String-Based Hooks (Works with REST API)
|
||||||
|
# ============================================================================
|
||||||
|
print("\n" + "─" * 60)
|
||||||
|
print("Part 1: String-Based Hooks (REST API)")
|
||||||
|
print("─" * 60)
|
||||||
|
|
||||||
|
hooks_config_string = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("[String Hook] Setting up page context")
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
"before_retrieve_html": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("[String Hook] Before retrieving HTML")
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {
|
||||||
|
"code": hooks_config_string,
|
||||||
|
"timeout": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print("🔧 Using string-based hooks for REST API...")
|
||||||
|
try:
|
||||||
|
start_time = time.time()
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload, timeout=60)
|
||||||
|
execution_time = time.time() - start_time
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
result = response.json()
|
||||||
|
print(f"✅ String-based hooks executed in {execution_time:.2f}s")
|
||||||
|
if result.get('results') and result['results'][0].get('success'):
|
||||||
|
html_length = len(result['results'][0].get('html', ''))
|
||||||
|
print(f" 📄 HTML length: {html_length} characters")
|
||||||
|
else:
|
||||||
|
print(f"❌ Request failed: {response.status_code}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Error: {str(e)}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PART 2: NEW Function-Based Hooks with Docker Client (v0.7.5)
|
||||||
|
# ============================================================================
|
||||||
|
print("\n" + "─" * 60)
|
||||||
|
print("Part 2: Function-Based Hooks with Docker Client (✨ NEW!)")
|
||||||
|
print("─" * 60)
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions
|
||||||
|
async def on_page_context_created_func(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
print("[Function Hook] Setting up page context")
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto_func(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers before navigation"""
|
||||||
|
print(f"[Function Hook] About to navigate to {url}")
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Crawl4AI': 'v0.7.5-function-hooks',
|
||||||
|
'X-Test-Header': 'demo'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_retrieve_html_func(page, context, **kwargs):
|
||||||
|
"""Scroll to load lazy content"""
|
||||||
|
print("[Function Hook] Scrolling page for lazy-loaded content")
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(500)
|
||||||
|
await page.evaluate("window.scrollTo(0, 0)")
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use the hooks_to_string utility (can be used standalone)
|
||||||
|
print("\n📦 Converting functions to strings with hooks_to_string()...")
|
||||||
|
hooks_as_strings = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created_func,
|
||||||
|
"before_goto": before_goto_func,
|
||||||
|
"before_retrieve_html": before_retrieve_html_func
|
||||||
|
})
|
||||||
|
print(f" ✓ Converted {len(hooks_as_strings)} hooks to string format")
|
||||||
|
|
||||||
|
# OR use Docker Client which does conversion automatically!
|
||||||
|
print("\n🐳 Using Docker Client with automatic conversion...")
|
||||||
|
try:
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
|
||||||
|
# Pass function objects directly - conversion happens automatically!
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created_func,
|
||||||
|
"before_goto": before_goto_func,
|
||||||
|
"before_retrieve_html": before_retrieve_html_func
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if results and results.success:
|
||||||
|
print(f"✅ Function-based hooks executed successfully!")
|
||||||
|
print(f" 📄 HTML length: {len(results.html)} characters")
|
||||||
|
print(f" 🎯 URL: {results.url}")
|
||||||
|
else:
|
||||||
|
print("⚠️ Crawl completed but may have warnings")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Docker client error: {str(e)}")
|
||||||
|
|
||||||
|
# Show the benefits
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("✨ Benefits of Function-Based Hooks:")
|
||||||
|
print("=" * 60)
|
||||||
|
print("✓ Full IDE support (autocomplete, syntax highlighting)")
|
||||||
|
print("✓ Type checking and linting")
|
||||||
|
print("✓ Easier to test and debug")
|
||||||
|
print("✓ Reusable across projects")
|
||||||
|
print("✓ Automatic conversion in Docker client")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
async def demo_2_enhanced_llm_integration():
|
||||||
|
"""Demo 2: Enhanced LLM Integration - Working LLM configurations"""
|
||||||
|
print_section(
|
||||||
|
"Demo 2: Enhanced LLM Integration",
|
||||||
|
"Testing custom LLM providers and configurations"
|
||||||
|
)
|
||||||
|
|
||||||
|
print("🤖 Testing Enhanced LLM Integration Features")
|
||||||
|
|
||||||
|
provider = "gemini/gemini-2.5-flash-lite"
|
||||||
|
payload = {
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Summarize this page in one sentence.",
|
||||||
|
"provider": provider, # Explicitly set provider
|
||||||
|
"temperature": 0.7
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/md",
|
||||||
|
json=payload,
|
||||||
|
timeout=60
|
||||||
|
)
|
||||||
|
if response.status_code == 200:
|
||||||
|
result = response.json()
|
||||||
|
print(f"✓ Request successful with provider: {provider}")
|
||||||
|
print(f" - Response keys: {list(result.keys())}")
|
||||||
|
print(f" - Content length: {len(result.get('markdown', ''))} characters")
|
||||||
|
print(f" - Note: Actual LLM call may fail without valid API key")
|
||||||
|
else:
|
||||||
|
print(f"❌ Request failed: {response.status_code}")
|
||||||
|
print(f" - Response: {response.text[:500]}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[red]Error: {e}[/]")
|
||||||
|
|
||||||
|
|
||||||
|
async def demo_3_https_preservation():
|
||||||
|
"""Demo 3: HTTPS Preservation - Live crawling with HTTPS maintenance"""
|
||||||
|
print_section(
|
||||||
|
"Demo 3: HTTPS Preservation",
|
||||||
|
"Testing HTTPS preservation for internal links"
|
||||||
|
)
|
||||||
|
|
||||||
|
print("🔒 Testing HTTPS Preservation Feature")
|
||||||
|
|
||||||
|
# Test with HTTPS preservation enabled
|
||||||
|
print("\nTest 1: HTTPS Preservation ENABLED")
|
||||||
|
|
||||||
|
url_filter = URLPatternFilter(
|
||||||
|
patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
|
||||||
|
)
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
exclude_external_links=True,
|
||||||
|
stream=True,
|
||||||
|
verbose=False,
|
||||||
|
preserve_https_for_internal_links=True,
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||||
|
max_depth=2,
|
||||||
|
max_pages=5,
|
||||||
|
filter_chain=FilterChain([url_filter])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
test_url = "https://quotes.toscrape.com"
|
||||||
|
print(f"🎯 Testing URL: {test_url}")
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
async for result in await crawler.arun(url=test_url, config=config):
|
||||||
|
print("✓ HTTPS Preservation Test Completed")
|
||||||
|
internal_links = [i['href'] for i in result.links['internal']]
|
||||||
|
for link in internal_links:
|
||||||
|
print(f" → {link}")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
"""Run all demos"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("🚀 Crawl4AI v0.7.5 Working Demo")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Check system requirements
|
||||||
|
print("🔍 System Requirements Check:")
|
||||||
|
print(f" - Python version: {sys.version.split()[0]} {'✓' if sys.version_info >= (3, 10) else '❌ (3.10+ required)'}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
import requests
|
||||||
|
print(f" - Requests library: ✓")
|
||||||
|
except ImportError:
|
||||||
|
print(f" - Requests library: ❌")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
demos = [
|
||||||
|
("Docker Hooks System", demo_1_docker_hooks_system),
|
||||||
|
("Enhanced LLM Integration", demo_2_enhanced_llm_integration),
|
||||||
|
("HTTPS Preservation", demo_3_https_preservation),
|
||||||
|
]
|
||||||
|
|
||||||
|
for i, (name, demo_func) in enumerate(demos, 1):
|
||||||
|
try:
|
||||||
|
print(f"\n📍 Starting Demo {i}/{len(demos)}: {name}")
|
||||||
|
await demo_func()
|
||||||
|
|
||||||
|
if i < len(demos):
|
||||||
|
print(f"\n✨ Demo {i} complete! Press Enter for next demo...")
|
||||||
|
input()
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print(f"\n⏹️ Demo interrupted by user")
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Demo {i} error: {str(e)}")
|
||||||
|
print("Continuing to next demo...")
|
||||||
|
continue
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("🎉 Demo Complete!")
|
||||||
|
print("=" * 60)
|
||||||
|
print("You've experienced the power of Crawl4AI v0.7.5!")
|
||||||
|
print("")
|
||||||
|
print("Key Features Demonstrated:")
|
||||||
|
print("🔧 Docker Hooks - String-based & function-based (NEW!)")
|
||||||
|
print(" • hooks_to_string() utility for function conversion")
|
||||||
|
print(" • Docker client with automatic conversion")
|
||||||
|
print(" • Full IDE support and type checking")
|
||||||
|
print("🤖 Enhanced LLM - Better AI integration")
|
||||||
|
print("🔒 HTTPS Preservation - Secure link handling")
|
||||||
|
print("")
|
||||||
|
print("Ready to build something amazing? 🚀")
|
||||||
|
print("")
|
||||||
|
print("📖 Docs: https://docs.crawl4ai.com/")
|
||||||
|
print("🐙 GitHub: https://github.com/unclecode/crawl4ai")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print("🚀 Crawl4AI v0.7.5 Live Demo Starting...")
|
||||||
|
print("Press Ctrl+C anytime to exit\n")
|
||||||
|
|
||||||
|
try:
|
||||||
|
asyncio.run(main())
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\n👋 Demo stopped by user. Thanks for trying Crawl4AI v0.7.5!")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"\n❌ Demo error: {str(e)}")
|
||||||
|
print("Make sure you have the required dependencies installed.")
|
||||||
359
docs/releases_review/demo_v0.7.6.py
Normal file
359
docs/releases_review/demo_v0.7.6.py
Normal file
@@ -0,0 +1,359 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Crawl4AI v0.7.6 Release Demo
|
||||||
|
============================
|
||||||
|
|
||||||
|
This demo showcases the major feature in v0.7.6:
|
||||||
|
**Webhook Support for Docker Job Queue API**
|
||||||
|
|
||||||
|
Features Demonstrated:
|
||||||
|
1. Asynchronous job processing with webhook notifications
|
||||||
|
2. Webhook support for /crawl/job endpoint
|
||||||
|
3. Webhook support for /llm/job endpoint
|
||||||
|
4. Notification-only vs data-in-payload modes
|
||||||
|
5. Custom webhook headers for authentication
|
||||||
|
6. Structured extraction with JSON schemas
|
||||||
|
7. Exponential backoff retry for reliable delivery
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
- Crawl4AI Docker container running on localhost:11235
|
||||||
|
- Flask installed: pip install flask requests
|
||||||
|
- LLM API key configured (for LLM examples)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python docs/releases_review/demo_v0.7.6.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CRAWL4AI_BASE_URL = "http://localhost:11235"
|
||||||
|
WEBHOOK_BASE_URL = "http://localhost:8080"
|
||||||
|
|
||||||
|
# Flask app for webhook receiver
|
||||||
|
app = Flask(__name__)
|
||||||
|
received_webhooks = []
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/webhook', methods=['POST'])
|
||||||
|
def webhook_handler():
|
||||||
|
"""Universal webhook handler for both crawl and LLM extraction jobs."""
|
||||||
|
payload = request.json
|
||||||
|
task_id = payload['task_id']
|
||||||
|
task_type = payload['task_type']
|
||||||
|
status = payload['status']
|
||||||
|
|
||||||
|
print(f"\n{'='*70}")
|
||||||
|
print(f"📬 Webhook Received!")
|
||||||
|
print(f" Task ID: {task_id}")
|
||||||
|
print(f" Task Type: {task_type}")
|
||||||
|
print(f" Status: {status}")
|
||||||
|
print(f" Timestamp: {payload['timestamp']}")
|
||||||
|
|
||||||
|
if status == 'completed':
|
||||||
|
if 'data' in payload:
|
||||||
|
print(f" ✅ Data included in webhook")
|
||||||
|
if task_type == 'crawl':
|
||||||
|
results = payload['data'].get('results', [])
|
||||||
|
print(f" 📊 Crawled {len(results)} URL(s)")
|
||||||
|
elif task_type == 'llm_extraction':
|
||||||
|
extracted = payload['data'].get('extracted_content', {})
|
||||||
|
print(f" 🤖 Extracted: {json.dumps(extracted, indent=6)}")
|
||||||
|
else:
|
||||||
|
print(f" 📥 Notification only (fetch data separately)")
|
||||||
|
elif status == 'failed':
|
||||||
|
print(f" ❌ Error: {payload.get('error', 'Unknown')}")
|
||||||
|
|
||||||
|
print(f"{'='*70}\n")
|
||||||
|
received_webhooks.append(payload)
|
||||||
|
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
|
||||||
|
def start_webhook_server():
|
||||||
|
"""Start Flask webhook server in background."""
|
||||||
|
app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|
||||||
|
|
||||||
|
|
||||||
|
def demo_1_crawl_webhook_notification_only():
|
||||||
|
"""Demo 1: Crawl job with webhook notification (data fetched separately)."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("DEMO 1: Crawl Job - Webhook Notification Only")
|
||||||
|
print("="*70)
|
||||||
|
print("Submitting crawl job with webhook notification...")
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
|
||||||
|
"webhook_data_in_payload": False,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Demo": "v0.7.6",
|
||||||
|
"X-Type": "crawl"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload)
|
||||||
|
if response.ok:
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"✅ Job submitted: {task_id}")
|
||||||
|
print("⏳ Webhook will notify when complete...")
|
||||||
|
return task_id
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def demo_2_crawl_webhook_with_data():
|
||||||
|
"""Demo 2: Crawl job with full data in webhook payload."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("DEMO 2: Crawl Job - Webhook with Full Data")
|
||||||
|
print("="*70)
|
||||||
|
print("Submitting crawl job with data included in webhook...")
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://www.python.org"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
|
||||||
|
"webhook_data_in_payload": True,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Demo": "v0.7.6",
|
||||||
|
"X-Type": "crawl-with-data"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload)
|
||||||
|
if response.ok:
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"✅ Job submitted: {task_id}")
|
||||||
|
print("⏳ Webhook will include full results...")
|
||||||
|
return task_id
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def demo_3_llm_webhook_notification_only():
|
||||||
|
"""Demo 3: LLM extraction with webhook notification (NEW in v0.7.6!)."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("DEMO 3: LLM Extraction - Webhook Notification Only (NEW!)")
|
||||||
|
print("="*70)
|
||||||
|
print("Submitting LLM extraction job with webhook notification...")
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"url": "https://www.example.com",
|
||||||
|
"q": "Extract the main heading and description from this page",
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"cache": False,
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
|
||||||
|
"webhook_data_in_payload": False,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Demo": "v0.7.6",
|
||||||
|
"X-Type": "llm"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(f"{CRAWL4AI_BASE_URL}/llm/job", json=payload)
|
||||||
|
if response.ok:
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"✅ Job submitted: {task_id}")
|
||||||
|
print("⏳ Webhook will notify when LLM extraction completes...")
|
||||||
|
return task_id
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def demo_4_llm_webhook_with_schema():
|
||||||
|
"""Demo 4: LLM extraction with JSON schema and data in webhook (NEW in v0.7.6!)."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("DEMO 4: LLM Extraction - Schema + Full Data in Webhook (NEW!)")
|
||||||
|
print("="*70)
|
||||||
|
print("Submitting LLM extraction with JSON schema...")
|
||||||
|
|
||||||
|
schema = {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"title": {"type": "string", "description": "Page title"},
|
||||||
|
"description": {"type": "string", "description": "Page description"},
|
||||||
|
"main_topics": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {"type": "string"},
|
||||||
|
"description": "Main topics covered"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["title"]
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"url": "https://www.python.org",
|
||||||
|
"q": "Extract the title, description, and main topics from this website",
|
||||||
|
"schema": json.dumps(schema),
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"cache": False,
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
|
||||||
|
"webhook_data_in_payload": True,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Demo": "v0.7.6",
|
||||||
|
"X-Type": "llm-with-schema"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(f"{CRAWL4AI_BASE_URL}/llm/job", json=payload)
|
||||||
|
if response.ok:
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"✅ Job submitted: {task_id}")
|
||||||
|
print("⏳ Webhook will include structured extraction results...")
|
||||||
|
return task_id
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def demo_5_global_webhook_config():
|
||||||
|
"""Demo 5: Using global webhook configuration from config.yml."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("DEMO 5: Global Webhook Configuration")
|
||||||
|
print("="*70)
|
||||||
|
print("💡 You can configure a default webhook URL in config.yml:")
|
||||||
|
print("""
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: "https://myapp.com/webhooks/default"
|
||||||
|
data_in_payload: false
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000
|
||||||
|
""")
|
||||||
|
print("Then submit jobs WITHOUT webhook_config - they'll use the default!")
|
||||||
|
print("This is useful for consistent webhook handling across all jobs.")
|
||||||
|
|
||||||
|
|
||||||
|
def demo_6_webhook_retry_logic():
|
||||||
|
"""Demo 6: Webhook retry mechanism with exponential backoff."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("DEMO 6: Webhook Retry Logic")
|
||||||
|
print("="*70)
|
||||||
|
print("🔄 Webhook delivery uses exponential backoff retry:")
|
||||||
|
print(" • Max attempts: 5")
|
||||||
|
print(" • Delays: 1s → 2s → 4s → 8s → 16s")
|
||||||
|
print(" • Timeout: 30s per attempt")
|
||||||
|
print(" • Retries on: 5xx errors, network errors, timeouts")
|
||||||
|
print(" • No retry on: 4xx client errors")
|
||||||
|
print("\nThis ensures reliable webhook delivery even with temporary failures!")
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary():
|
||||||
|
"""Print demo summary and results."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("📊 DEMO SUMMARY")
|
||||||
|
print("="*70)
|
||||||
|
print(f"Total webhooks received: {len(received_webhooks)}")
|
||||||
|
|
||||||
|
crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
|
||||||
|
llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']
|
||||||
|
|
||||||
|
print(f"\nBreakdown:")
|
||||||
|
print(f" 🕷️ Crawl jobs: {len(crawl_webhooks)}")
|
||||||
|
print(f" 🤖 LLM extraction jobs: {len(llm_webhooks)}")
|
||||||
|
|
||||||
|
print(f"\nDetails:")
|
||||||
|
for i, webhook in enumerate(received_webhooks, 1):
|
||||||
|
icon = "🕷️" if webhook['task_type'] == 'crawl' else "🤖"
|
||||||
|
print(f" {i}. {icon} {webhook['task_id']}: {webhook['status']}")
|
||||||
|
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("✨ v0.7.6 KEY FEATURES DEMONSTRATED:")
|
||||||
|
print("="*70)
|
||||||
|
print("✅ Webhook support for /crawl/job")
|
||||||
|
print("✅ Webhook support for /llm/job (NEW!)")
|
||||||
|
print("✅ Notification-only mode (fetch data separately)")
|
||||||
|
print("✅ Data-in-payload mode (get full results in webhook)")
|
||||||
|
print("✅ Custom headers for authentication")
|
||||||
|
print("✅ JSON schema for structured LLM extraction")
|
||||||
|
print("✅ Exponential backoff retry for reliable delivery")
|
||||||
|
print("✅ Global webhook configuration support")
|
||||||
|
print("✅ Universal webhook handler for both job types")
|
||||||
|
print("\n💡 Benefits:")
|
||||||
|
print(" • No more polling - get instant notifications")
|
||||||
|
print(" • Better resource utilization")
|
||||||
|
print(" • Reliable delivery with automatic retries")
|
||||||
|
print(" • Consistent API across crawl and LLM jobs")
|
||||||
|
print(" • Production-ready webhook infrastructure")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Run all demos."""
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("🚀 Crawl4AI v0.7.6 Release Demo")
|
||||||
|
print("="*70)
|
||||||
|
print("Feature: Webhook Support for Docker Job Queue API")
|
||||||
|
print("="*70)
|
||||||
|
|
||||||
|
# Check if server is running
|
||||||
|
try:
|
||||||
|
health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
|
||||||
|
print(f"✅ Crawl4AI server is running")
|
||||||
|
except:
|
||||||
|
print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
|
||||||
|
print("Please start Docker container:")
|
||||||
|
print(" docker run -d -p 11235:11235 --env-file .llm.env unclecode/crawl4ai:0.7.6")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Start webhook server
|
||||||
|
print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
|
||||||
|
webhook_thread = Thread(target=start_webhook_server, daemon=True)
|
||||||
|
webhook_thread.start()
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
# Run demos
|
||||||
|
demo_1_crawl_webhook_notification_only()
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
demo_2_crawl_webhook_with_data()
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
demo_3_llm_webhook_notification_only()
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
demo_4_llm_webhook_with_schema()
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
demo_5_global_webhook_config()
|
||||||
|
demo_6_webhook_retry_logic()
|
||||||
|
|
||||||
|
# Wait for webhooks
|
||||||
|
print("\n⏳ Waiting for all webhooks to arrive...")
|
||||||
|
time.sleep(30)
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print_summary()
|
||||||
|
|
||||||
|
print("\n" + "="*70)
|
||||||
|
print("✅ Demo completed!")
|
||||||
|
print("="*70)
|
||||||
|
print("\n📚 Documentation:")
|
||||||
|
print(" • deploy/docker/WEBHOOK_EXAMPLES.md")
|
||||||
|
print(" • docs/examples/docker_webhook_example.py")
|
||||||
|
print("\n🔗 Upgrade:")
|
||||||
|
print(" docker pull unclecode/crawl4ai:0.7.6")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
655
docs/releases_review/v0.7.5_docker_hooks_demo.py
Normal file
655
docs/releases_review/v0.7.5_docker_hooks_demo.py
Normal file
@@ -0,0 +1,655 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
🚀 Crawl4AI v0.7.5 - Docker Hooks System Complete Demonstration
|
||||||
|
================================================================
|
||||||
|
|
||||||
|
This file demonstrates the NEW Docker Hooks System introduced in v0.7.5.
|
||||||
|
|
||||||
|
The Docker Hooks System is a completely NEW feature that provides pipeline
|
||||||
|
customization through user-provided Python functions. It offers three approaches:
|
||||||
|
|
||||||
|
1. String-based hooks for REST API
|
||||||
|
2. hooks_to_string() utility to convert functions
|
||||||
|
3. Docker Client with automatic conversion (most convenient)
|
||||||
|
|
||||||
|
All three approaches are part of this NEW v0.7.5 feature!
|
||||||
|
|
||||||
|
Perfect for video recording and demonstration purposes.
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||||
|
- crawl4ai v0.7.5 installed: pip install crawl4ai==0.7.5
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
# Import Crawl4AI components
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
DOCKER_URL = "http://localhost:11235"
|
||||||
|
# DOCKER_URL = "http://localhost:11234"
|
||||||
|
TEST_URLS = [
|
||||||
|
# "https://httpbin.org/html",
|
||||||
|
"https://www.kidocode.com",
|
||||||
|
"https://quotes.toscrape.com",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def print_section(title: str, description: str = ""):
    """Print a banner-style section header (title plus optional subtitle)."""
    bar = "=" * 70
    banner = ["\n" + bar, f" {title}"]
    if description:
        banner.append(f" {description}")
    banner.append(bar + "\n")
    for line in banner:
        print(line)
||||||
|
|
||||||
|
|
||||||
|
def check_docker_service() -> bool:
    """Return True if the Crawl4AI Docker service answers its health check.

    Sends GET {DOCKER_URL}/health with a 3-second timeout. Any request-level
    failure (connection refused, timeout, DNS error, ...) is treated as
    "service not running" rather than propagated.
    """
    try:
        response = requests.get(f"{DOCKER_URL}/health", timeout=3)
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently reported as "down".
        return False
    return response.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# REUSABLE HOOK LIBRARY (NEW in v0.7.5)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def performance_optimization_hook(page, context, **kwargs):
    """Speed up crawling by aborting requests for images, ads and analytics.

    Registers abort routes on the browser context and returns the page
    unchanged.
    """
    print(" [Hook] 🚀 Optimizing performance - blocking images and ads...")

    # All patterns we refuse to fetch, in registration order.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,webp,svg,ico}",   # static images
        "**/analytics/*",                          # analytics endpoints
        "**/ads/*",                                # ad networks
        "**/google-analytics.com/*",               # Google Analytics
    )
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    print(" [Hook] ✓ Performance optimization applied")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def viewport_setup_hook(page, context, **kwargs):
    """Force a consistent 1920x1080 viewport so pages render deterministically."""
    print(" [Hook] 🖥️ Setting viewport to 1920x1080...")
    desired_viewport = {"width": 1920, "height": 1080}
    await page.set_viewport_size(desired_viewport)
    print(" [Hook] ✓ Viewport configured")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def authentication_headers_hook(page, context, url, **kwargs):
    """Attach demo authentication/tracking headers before navigating to *url*."""
    print(f" [Hook] 🔐 Adding custom headers for {url[:50]}...")

    # Static demo headers; a real deployment would inject credentials here.
    extra_headers = {
        'X-Crawl4AI-Version': '0.7.5',
        'X-Custom-Hook': 'function-based-demo',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Crawl4AI/0.7.5 (Educational Demo)',
    }
    await page.set_extra_http_headers(extra_headers)

    print(" [Hook] ✓ Custom headers added")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def lazy_loading_handler_hook(page, context, **kwargs):
    """Trigger lazy-loaded content by scrolling bottom -> middle -> top."""
    print(" [Hook] 📜 Scrolling to load lazy content...")

    # (javascript, pause-in-ms) steps executed in order.
    scroll_plan = (
        ("window.scrollTo(0, document.body.scrollHeight)", 1000),      # bottom
        ("window.scrollTo(0, document.body.scrollHeight / 2)", 500),   # middle
        ("window.scrollTo(0, 0)", 500),                                # back to top
    )
    for script, pause_ms in scroll_plan:
        await page.evaluate(script)
        await page.wait_for_timeout(pause_ms)

    print(" [Hook] ✓ Lazy content loaded")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def page_analytics_hook(page, context, **kwargs):
    """
    Analytics Hook: Log page metrics before extraction

    Runs a single in-page JS evaluation to count images, links, scripts,
    headings and paragraphs, prints a short report, and returns the page
    unchanged.
    """
    print(" [Hook] 📊 Collecting page analytics...")

    # One evaluate() round-trip collects all counters at once.
    metrics = await page.evaluate('''
        () => ({
            title: document.title,
            images: document.images.length,
            links: document.links.length,
            scripts: document.scripts.length,
            headings: document.querySelectorAll('h1, h2, h3').length,
            paragraphs: document.querySelectorAll('p').length
        })
    ''')

    # Title is truncated to keep the console report on one line.
    print(f" [Hook] 📈 Page: {metrics['title'][:50]}...")
    print(f" Links: {metrics['links']}, Images: {metrics['images']}, "
          f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}")

    return page
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 1: String-Based Hooks (NEW Docker Hooks System)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def demo_1_string_based_hooks():
    """
    Demonstrate string-based hooks with REST API (part of NEW Docker Hooks System)

    Sends a /crawl request whose hooks are supplied as raw Python source
    strings; the server compiles each string and calls the function named
    `hook`. Prints crawl and hook-execution statistics.
    """
    print_section(
        "DEMO 1: String-Based Hooks (REST API)",
        "Part of the NEW Docker Hooks System - hooks as strings"
    )

    # Define hooks as strings.
    # NOTE(review): the hook source is written at top-level indentation so it
    # compiles under a bare exec — confirm the server does not dedent for us.
    hooks_config = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Setting up page context...")
    # Block images for performance
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f" [String Hook] Navigating to {url[:50]}...")
    await page.set_extra_http_headers({
        'X-Crawl4AI': 'string-based-hooks',
        'X-Demo': 'v0.7.5'
    })
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Scrolling page...")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    return page
"""
    }

    # Prepare request payload; "timeout" bounds each hook's run time server-side.
    payload = {
        "urls": [TEST_URLS[0]],
        "hooks": {
            "code": hooks_config,
            "timeout": 30
        },
        "crawler_config": {
            "cache_mode": "bypass"  # always fetch fresh so hooks actually run
        }
    }

    print(f"🎯 Target URL: {TEST_URLS[0]}")
    print(f"🔧 Configured {len(hooks_config)} string-based hooks")
    print(f"📡 Sending request to Docker API...\n")

    try:
        start_time = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()

            print(f"\n✅ Request successful! (took {execution_time:.2f}s)")

            # Display results
            if result.get('results') and result['results'][0].get('success'):
                crawl_result = result['results'][0]
                html_length = len(crawl_result.get('html', ''))
                markdown_length = len(crawl_result.get('markdown', ''))

                print(f"\n📊 Results:")
                print(f" • HTML length: {html_length:,} characters")
                print(f" • Markdown length: {markdown_length:,} characters")
                print(f" • URL: {crawl_result.get('url')}")

                # Check hooks execution.
                # NOTE(review): assumes the server response schema
                # result['hooks']['status']/['summary'] — confirm against the
                # Docker API docs.
                if 'hooks' in result:
                    hooks_info = result['hooks']
                    print(f"\n🎣 Hooks Execution:")
                    print(f" • Status: {hooks_info['status']['status']}")
                    print(f" • Attached hooks: {len(hooks_info['status']['attached_hooks'])}")

                    if 'summary' in hooks_info:
                        summary = hooks_info['summary']
                        print(f" • Total executions: {summary['total_executions']}")
                        print(f" • Successful: {summary['successful']}")
                        print(f" • Success rate: {summary['success_rate']:.1f}%")
            else:
                print(f"⚠️ Crawl completed but no results")

        else:
            print(f"❌ Request failed with status {response.status_code}")
            print(f" Error: {response.text[:200]}")

    except requests.exceptions.Timeout:
        print("⏰ Request timed out after 60 seconds")
    except Exception as e:
        print(f"❌ Error: {str(e)}")

    print("\n" + "─" * 70)
    print("✓ String-based hooks demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 2: Function-Based Hooks with hooks_to_string() Utility
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def demo_2_hooks_to_string_utility():
    """
    Demonstrate the new hooks_to_string() utility for converting functions

    Converts the reusable hook functions defined above into source strings
    with hooks_to_string(), then sends them through the plain REST /crawl
    endpoint exactly like demo 1.
    """
    print_section(
        "DEMO 2: hooks_to_string() Utility (NEW! ✨)",
        "Convert Python functions to strings for REST API"
    )

    print("📦 Creating hook functions...")
    # NOTE(review): viewport_setup_hook is listed here but is not included in
    # hooks_dict below — the printed list overstates what is sent.
    print(" • performance_optimization_hook")
    print(" • viewport_setup_hook")
    print(" • authentication_headers_hook")
    print(" • lazy_loading_handler_hook")

    # Convert function objects to strings using the NEW utility
    print("\n🔄 Converting functions to strings with hooks_to_string()...")

    hooks_dict = {
        "on_page_context_created": performance_optimization_hook,
        "before_goto": authentication_headers_hook,
        "before_retrieve_html": lazy_loading_handler_hook,
    }

    hooks_as_strings = hooks_to_string(hooks_dict)

    print(f"✅ Successfully converted {len(hooks_as_strings)} functions to strings")

    # Show a preview of one converted hook.
    print("\n📝 Sample converted hook (first 250 characters):")
    print("─" * 70)
    sample_hook = list(hooks_as_strings.values())[0]
    print(sample_hook[:250] + "...")
    print("─" * 70)

    # Use the converted hooks with REST API
    print("\n📡 Using converted hooks with REST API...")

    payload = {
        "urls": [TEST_URLS[0]],
        "hooks": {
            "code": hooks_as_strings,
            "timeout": 30
        }
    }

    try:
        start_time = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()
            print(f"\n✅ Request successful! (took {execution_time:.2f}s)")

            if result.get('results') and result['results'][0].get('success'):
                crawl_result = result['results'][0]
                print(f" • HTML length: {len(crawl_result.get('html', '')):,} characters")
                print(f" • Hooks executed successfully!")
        else:
            print(f"❌ Request failed: {response.status_code}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

    print("\n💡 Benefits of hooks_to_string():")
    print(" ✓ Write hooks as regular Python functions")
    print(" ✓ Full IDE support (autocomplete, syntax highlighting)")
    print(" ✓ Type checking and linting")
    print(" ✓ Easy to test and debug")
    print(" ✓ Reusable across projects")
    print(" ✓ Works with any REST API client")

    print("\n" + "─" * 70)
    print("✓ hooks_to_string() utility demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 3: Docker Client with Automatic Conversion (RECOMMENDED! 🌟)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def demo_3_docker_client_auto_conversion():
    """
    Demonstrate Docker Client with automatic hook conversion (RECOMMENDED)

    Passes plain async hook functions straight to Crawl4aiDockerClient.crawl();
    the client converts them to source strings for the server transparently.
    Prints crawl results, metadata and link counts.
    """
    print_section(
        "DEMO 3: Docker Client with Auto-Conversion (RECOMMENDED! 🌟)",
        "Pass function objects directly - conversion happens automatically!"
    )

    print("🐳 Initializing Crawl4AI Docker Client...")
    client = Crawl4aiDockerClient(base_url=DOCKER_URL)

    print("✅ Client ready!\n")

    # Use our reusable hook library - just pass the function objects!
    # FIX(review): list exactly the hooks passed to client.crawl() below
    # (viewport_setup_hook was previously listed here but never sent).
    print("📚 Using reusable hook library:")
    print(" • performance_optimization_hook")
    print(" • authentication_headers_hook")
    print(" • lazy_loading_handler_hook")
    print(" • page_analytics_hook")

    # FIX(review): the crawl below targets TEST_URLS[0]; the banner used to
    # print TEST_URLS[1], which was misleading.
    print("\n🎯 Target URL: " + TEST_URLS[0])
    print("🚀 Starting crawl with automatic hook conversion...\n")

    try:
        start_time = time.time()

        # Pass function objects directly - NO manual conversion needed! ✨
        results = await client.crawl(
            urls=[TEST_URLS[0]],
            hooks={
                "on_page_context_created": performance_optimization_hook,
                "before_goto": authentication_headers_hook,
                "before_retrieve_html": lazy_loading_handler_hook,
                "before_return_html": page_analytics_hook,
            },
            hooks_timeout=30
        )

        execution_time = time.time() - start_time

        print(f"\n✅ Crawl completed! (took {execution_time:.2f}s)\n")

        # Display results (single-URL crawl: `results` is one result object).
        if results and results.success:
            result = results
            print(f"📊 Results:")
            print(f" • URL: {result.url}")
            print(f" • Success: {result.success}")
            print(f" • HTML length: {len(result.html):,} characters")
            print(f" • Markdown length: {len(result.markdown):,} characters")

            # Show metadata
            if result.metadata:
                print(f"\n📋 Metadata:")
                print(f" • Title: {result.metadata.get('title', 'N/A')}")
                print(f" • Description: {result.metadata.get('description', 'N/A')}")

            # Show links
            if result.links:
                internal_count = len(result.links.get('internal', []))
                external_count = len(result.links.get('external', []))
                print(f"\n🔗 Links Found:")
                print(f" • Internal: {internal_count}")
                print(f" • External: {external_count}")
        else:
            print(f"⚠️ Crawl completed but no successful results")
            if results:
                print(f" Error: {results.error_message}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()

    print("\n🌟 Why Docker Client is RECOMMENDED:")
    print(" ✓ Automatic function-to-string conversion")
    print(" ✓ No manual hooks_to_string() calls needed")
    print(" ✓ Cleaner, more Pythonic code")
    print(" ✓ Full type hints and IDE support")
    print(" ✓ Built-in error handling")
    print(" ✓ Async/await support")

    print("\n" + "─" * 70)
    print("✓ Docker Client auto-conversion demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 4: Advanced Use Case - Complete Hook Pipeline
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def demo_4_complete_hook_pipeline():
    """
    Demonstrate a complete hook pipeline using all 8 hook points

    Defines one inline hook per pipeline stage (numbered 1/8 ... 8/8 in its
    console output) and runs a single crawl through the Docker client with
    all of them attached.
    """
    print_section(
        "DEMO 4: Complete Hook Pipeline",
        "Using all 8 available hook points for comprehensive control"
    )

    # Define all 8 hooks (closures; each returns its first argument so the
    # pipeline can continue).
    async def on_browser_created_hook(browser, **kwargs):
        """Hook 1: Called after browser is created"""
        print(" [Pipeline] 1/8 Browser created")
        return browser

    async def on_page_context_created_hook(page, context, **kwargs):
        """Hook 2: Called after page context is created"""
        print(" [Pipeline] 2/8 Page context created - setting up...")
        await page.set_viewport_size({"width": 1920, "height": 1080})
        return page

    async def on_user_agent_updated_hook(page, context, user_agent, **kwargs):
        """Hook 3: Called when user agent is updated"""
        print(f" [Pipeline] 3/8 User agent updated: {user_agent[:50]}...")
        return page

    async def before_goto_hook(page, context, url, **kwargs):
        """Hook 4: Called before navigating to URL"""
        print(f" [Pipeline] 4/8 Before navigation to: {url[:60]}...")
        return page

    async def after_goto_hook(page, context, url, response, **kwargs):
        """Hook 5: Called after navigation completes"""
        # `response` may be None (e.g. cached navigation) — guard the access.
        print(f" [Pipeline] 5/8 After navigation - Status: {response.status if response else 'N/A'}")
        await page.wait_for_timeout(1000)
        return page

    async def on_execution_started_hook(page, context, **kwargs):
        """Hook 6: Called when JavaScript execution starts"""
        print(" [Pipeline] 6/8 JavaScript execution started")
        return page

    async def before_retrieve_html_hook(page, context, **kwargs):
        """Hook 7: Called before retrieving HTML"""
        print(" [Pipeline] 7/8 Before HTML retrieval - scrolling...")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        return page

    async def before_return_html_hook(page, context, html, **kwargs):
        """Hook 8: Called before returning HTML"""
        print(f" [Pipeline] 8/8 Before return - HTML length: {len(html):,} chars")
        return page

    print("🎯 Target URL: " + TEST_URLS[0])
    print("🔧 Configured ALL 8 hook points for complete pipeline control\n")

    client = Crawl4aiDockerClient(base_url=DOCKER_URL)

    try:
        print("🚀 Starting complete pipeline crawl...\n")
        start_time = time.time()

        results = await client.crawl(
            urls=[TEST_URLS[0]],
            hooks={
                "on_browser_created": on_browser_created_hook,
                "on_page_context_created": on_page_context_created_hook,
                "on_user_agent_updated": on_user_agent_updated_hook,
                "before_goto": before_goto_hook,
                "after_goto": after_goto_hook,
                "on_execution_started": on_execution_started_hook,
                "before_retrieve_html": before_retrieve_html_hook,
                "before_return_html": before_return_html_hook,
            },
            hooks_timeout=45  # longer budget: 8 hooks including waits/scrolls
        )

        execution_time = time.time() - start_time

        if results and results.success:
            print(f"\n✅ Complete pipeline executed successfully! (took {execution_time:.2f}s)")
            print(f" • All 8 hooks executed in sequence")
            print(f" • HTML length: {len(results.html):,} characters")
        else:
            print(f"⚠️ Pipeline completed with warnings")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

    print("\n📚 Available Hook Points:")
    print(" 1. on_browser_created - Browser initialization")
    print(" 2. on_page_context_created - Page context setup")
    print(" 3. on_user_agent_updated - User agent configuration")
    print(" 4. before_goto - Pre-navigation setup")
    print(" 5. after_goto - Post-navigation processing")
    print(" 6. on_execution_started - JavaScript execution start")
    print(" 7. before_retrieve_html - Pre-extraction processing")
    print(" 8. before_return_html - Final HTML processing")

    print("\n" + "─" * 70)
    print("✓ Complete hook pipeline demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN EXECUTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def main():
    """
    Run all demonstrations

    Verifies the Docker service is reachable, runs each demo in order
    (sync demos called directly, async demos awaited), then prints a
    feature summary. Failures in one demo do not stop the others.
    """
    print("\n" + "=" * 70)
    print(" 🚀 Crawl4AI v0.7.5 - Docker Hooks Complete Demonstration")
    print("=" * 70)

    # Check Docker service; bail out early with setup instructions if absent.
    print("\n🔍 Checking Docker service status...")
    if not check_docker_service():
        print("❌ Docker service is not running!")
        print("\n📋 To start the Docker service:")
        print(" docker run -p 11235:11235 unclecode/crawl4ai:latest")
        print("\nPlease start the service and run this demo again.")
        return

    print("✅ Docker service is running!\n")

    # Run all demos: (display name, callable, is_async).
    # NOTE: demo 4 (Complete Hook Pipeline) is intentionally disabled here.
    demos = [
        ("String-Based Hooks (REST API)", demo_1_string_based_hooks, False),
        ("hooks_to_string() Utility", demo_2_hooks_to_string_utility, False),
        ("Docker Client Auto-Conversion", demo_3_docker_client_auto_conversion, True),
        # ("Complete Hook Pipeline", demo_4_complete_hook_pipeline, True),
    ]

    for i, (name, demo_func, is_async) in enumerate(demos, 1):
        print(f"\n{'🔷' * 35}")
        print(f"Starting Demo {i}/{len(demos)}: {name}")
        print(f"{'🔷' * 35}\n")

        try:
            if is_async:
                await demo_func()
            else:
                demo_func()

            print(f"✅ Demo {i} completed successfully!")

            # Pause between demos (except the last one).
            # The interactive pause is disabled for unattended runs.
            if i < len(demos):
                print("\n⏸️ Press Enter to continue to next demo...")
                # input()

        except KeyboardInterrupt:
            # Ctrl-C stops the whole suite, not just the current demo.
            print(f"\n⏹️ Demo interrupted by user")
            break
        except Exception as e:
            # Any other failure is reported and the suite continues.
            print(f"\n❌ Demo {i} failed: {str(e)}")
            import traceback
            traceback.print_exc()
            print("\nContinuing to next demo...\n")
            continue

    # Final summary
    print("\n" + "=" * 70)
    print(" 🎉 All Demonstrations Complete!")
    print("=" * 70)

    print("\n📊 Summary of v0.7.5 Docker Hooks System:")
    print("\n🆕 COMPLETELY NEW FEATURE in v0.7.5:")
    print(" The Docker Hooks System lets you customize the crawling pipeline")
    print(" with user-provided Python functions at 8 strategic points.")

    print("\n✨ Three Ways to Use Docker Hooks (All NEW!):")
    print(" 1. String-based - Write hooks as strings for REST API")
    print(" 2. hooks_to_string() - Convert Python functions to strings")
    print(" 3. Docker Client - Automatic conversion (RECOMMENDED)")

    print("\n💡 Key Benefits:")
    print(" ✓ Full IDE support (autocomplete, syntax highlighting)")
    print(" ✓ Type checking and linting")
    print(" ✓ Easy to test and debug")
    print(" ✓ Reusable across projects")
    print(" ✓ Complete pipeline control")

    print("\n🎯 8 Hook Points Available:")
    print(" • on_browser_created, on_page_context_created")
    print(" • on_user_agent_updated, before_goto, after_goto")
    print(" • on_execution_started, before_retrieve_html, before_return_html")

    print("\n📚 Resources:")
    print(" • Docs: https://docs.crawl4ai.com")
    print(" • GitHub: https://github.com/unclecode/crawl4ai")
    print(" • Discord: https://discord.gg/jP8KfhDhyN")

    print("\n" + "=" * 70)
    print(" Happy Crawling with v0.7.5! 🕷️")
    print("=" * 70 + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: drive the async demo suite and keep Ctrl-C friendly.
if __name__ == "__main__":
    print("\n🎬 Starting Crawl4AI v0.7.5 Docker Hooks Demonstration...")
    print("Press Ctrl+C anytime to exit\n")

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Graceful exit on Ctrl-C.
        print("\n\n👋 Demo stopped by user. Thanks for exploring Crawl4AI v0.7.5!")
    except Exception as e:
        # Any unexpected failure is reported with a full traceback.
        print(f"\n\n❌ Demo error: {str(e)}")
        import traceback
        traceback.print_exc()
|
||||||
1516
docs/releases_review/v0.7.5_video_walkthrough.ipynb
Normal file
1516
docs/releases_review/v0.7.5_video_walkthrough.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@ docs_dir: docs/md_v2
|
|||||||
|
|
||||||
nav:
|
nav:
|
||||||
- Home: 'index.md'
|
- Home: 'index.md'
|
||||||
|
- "📚 Complete SDK Reference": "complete-sdk-reference.md"
|
||||||
- "Ask AI": "core/ask-ai.md"
|
- "Ask AI": "core/ask-ai.md"
|
||||||
- "Quick Start": "core/quickstart.md"
|
- "Quick Start": "core/quickstart.md"
|
||||||
- "Code Examples": "core/examples.md"
|
- "Code Examples": "core/examples.md"
|
||||||
|
|||||||
401
test_llm_webhook_feature.py
Normal file
401
test_llm_webhook_feature.py
Normal file
@@ -0,0 +1,401 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script to validate webhook implementation for /llm/job endpoint.
|
||||||
|
|
||||||
|
This tests that the /llm/job endpoint now supports webhooks
|
||||||
|
following the same pattern as /crawl/job.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add deploy/docker to path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
||||||
|
|
||||||
|
def test_llm_job_payload_model():
    """Test that LlmJobPayload includes an optional webhook_config field.

    Returns True when the model accepts a webhook_config dict and also works
    without one; returns False (after printing a traceback) on any failure,
    including import errors.
    """
    print("=" * 60)
    print("TEST 1: LlmJobPayload Model")
    print("=" * 60)

    try:
        from job import LlmJobPayload
        # Importing WebhookConfig doubles as an existence check on schemas.
        # (FIX(review): dropped the unused `from pydantic import
        # ValidationError` import.)
        from schemas import WebhookConfig

        # Test with webhook_config
        payload_dict = {
            "url": "https://example.com",
            "q": "Extract main content",
            "schema": None,
            "cache": False,
            "provider": None,
            "webhook_config": {
                "webhook_url": "https://myapp.com/webhook",
                "webhook_data_in_payload": True,
                "webhook_headers": {"X-Secret": "token"}
            }
        }

        payload = LlmJobPayload(**payload_dict)

        print(f"✅ LlmJobPayload accepts webhook_config")
        print(f" - URL: {payload.url}")
        print(f" - Query: {payload.q}")
        print(f" - Webhook URL: {payload.webhook_config.webhook_url}")
        print(f" - Data in payload: {payload.webhook_config.webhook_data_in_payload}")

        # Test without webhook_config (should be optional)
        minimal_payload = {
            "url": "https://example.com",
            "q": "Extract content"
        }

        payload2 = LlmJobPayload(**minimal_payload)
        assert payload2.webhook_config is None, "webhook_config should be optional"
        print(f"✅ LlmJobPayload works without webhook_config (optional)")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_handle_llm_request_signature():
    """Test that handle_llm_request accepts an optional webhook_config parameter.

    Returns True when the parameter exists (regardless of its default),
    False on import failure or when the parameter is missing.
    """
    print("\n" + "=" * 60)
    print("TEST 2: handle_llm_request Function Signature")
    print("=" * 60)

    try:
        from api import handle_llm_request
        import inspect

        sig = inspect.signature(handle_llm_request)
        params = list(sig.parameters.keys())

        print(f"Function parameters: {params}")

        if 'webhook_config' in params:
            print(f"✅ handle_llm_request has webhook_config parameter")

            # FIX(review): inspect.Parameter.empty means the parameter has NO
            # default and is therefore REQUIRED — the old check wrongly
            # reported that case as "optional".
            webhook_param = sig.parameters['webhook_config']
            if webhook_param.default is inspect.Parameter.empty:
                print(f"⚠️ webhook_config has no default value and is therefore required")
            elif webhook_param.default is None:
                print(f"✅ webhook_config is optional (default: {webhook_param.default})")
            else:
                print(f"⚠️ webhook_config default is: {webhook_param.default}")

            return True
        else:
            print(f"❌ handle_llm_request missing webhook_config parameter")
            return False

    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_process_llm_extraction_signature():
    """Test that process_llm_extraction accepts an optional webhook_config parameter.

    Returns True when the parameter exists (regardless of its default),
    False on import failure or when the parameter is missing.
    """
    print("\n" + "=" * 60)
    print("TEST 3: process_llm_extraction Function Signature")
    print("=" * 60)

    try:
        from api import process_llm_extraction
        import inspect

        sig = inspect.signature(process_llm_extraction)
        params = list(sig.parameters.keys())

        print(f"Function parameters: {params}")

        if 'webhook_config' in params:
            print(f"✅ process_llm_extraction has webhook_config parameter")

            # FIX(review): inspect.Parameter.empty means the parameter has NO
            # default and is therefore REQUIRED — the old check wrongly
            # reported that case as "optional".
            webhook_param = sig.parameters['webhook_config']
            if webhook_param.default is inspect.Parameter.empty:
                print(f"⚠️ webhook_config has no default value and is therefore required")
            elif webhook_param.default is None:
                print(f"✅ webhook_config is optional (default: {webhook_param.default})")
            else:
                print(f"⚠️ webhook_config default is: {webhook_param.default}")

            return True
        else:
            print(f"❌ process_llm_extraction missing webhook_config parameter")
            return False

    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_webhook_integration_in_api():
    """Statically verify that deploy/docker/api.py wires webhook notifications
    into process_llm_extraction.

    Checks (by substring search, no import of the server code):
      * a WebhookDeliveryService is constructed,
      * notifications use task_type="llm_extraction",
      * process_llm_extraction contains at least 3 notify calls
        (success + two failure paths).

    Returns:
        bool: True when the mandatory markers are present, False otherwise
        (including when api.py cannot be read).
    """
    print("\n" + "=" * 60)
    print("TEST 4: Webhook Integration in process_llm_extraction")
    print("=" * 60)

    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')

        with open(api_file, 'r') as f:
            api_content = f.read()

        # Check for WebhookDeliveryService initialization
        if 'webhook_service = WebhookDeliveryService(config)' in api_content:
            print("✅ process_llm_extraction initializes WebhookDeliveryService")
        else:
            print("❌ Missing WebhookDeliveryService initialization in process_llm_extraction")
            return False

        # Check for notify_job_completion calls with llm_extraction
        if 'task_type="llm_extraction"' in api_content:
            print("✅ Uses correct task_type='llm_extraction' for notifications")
        else:
            print("❌ Missing task_type='llm_extraction' in webhook notifications")
            return False

        # Count webhook notification calls only inside process_llm_extraction.
        # (A whole-file count was previously computed here but never used.)
        llm_func_start = api_content.find('async def process_llm_extraction')
        llm_func_end = api_content.find('\nasync def ', llm_func_start + 1)
        if llm_func_end == -1:
            llm_func_end = len(api_content)

        llm_func_content = api_content[llm_func_start:llm_func_end]
        llm_notification_count = llm_func_content.count('await webhook_service.notify_job_completion')

        print(f"✅ Found {llm_notification_count} webhook notification calls in process_llm_extraction")

        if llm_notification_count >= 3:
            print(f"✅ Sufficient notification points (success + failure paths)")
        else:
            # Warning only: fewer call sites may still be correct.
            print(f"⚠️ Expected at least 3 notification calls, found {llm_notification_count}")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_job_endpoint_integration():
    """Statically verify that the /llm/job endpoint (llm_job_enqueue in
    deploy/docker/job.py) extracts webhook_config from the payload and
    forwards it to handle_llm_request.

    Returns:
        bool: True when all four markers are found inside llm_job_enqueue,
        False otherwise (including when job.py cannot be read).
    """
    print("\n" + "=" * 60)
    print("TEST 5: /llm/job Endpoint Integration")
    print("=" * 60)

    try:
        job_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'job.py')

        with open(job_file, 'r') as f:
            job_content = f.read()

        # Find the llm_job_enqueue function
        llm_job_start = job_content.find('async def llm_job_enqueue')
        llm_job_end = job_content.find('\n\n@router', llm_job_start + 1)
        if llm_job_end == -1:
            llm_job_end = job_content.find('\n\nasync def', llm_job_start + 1)
        if llm_job_end == -1:
            # Fix: previously a double miss left -1 here, silently truncating
            # the slice to [start:-1]. Clamp to EOF like the sibling checks do.
            llm_job_end = len(job_content)

        llm_job_func = job_content[llm_job_start:llm_job_end]

        # Check for webhook_config extraction
        if 'webhook_config = None' in llm_job_func:
            print("✅ llm_job_enqueue initializes webhook_config variable")
        else:
            print("❌ Missing webhook_config initialization")
            return False

        if 'if payload.webhook_config:' in llm_job_func:
            print("✅ llm_job_enqueue checks for payload.webhook_config")
        else:
            print("❌ Missing webhook_config check")
            return False

        if 'webhook_config = payload.webhook_config.model_dump(mode=\'json\')' in llm_job_func:
            print("✅ llm_job_enqueue converts webhook_config to dict")
        else:
            print("❌ Missing webhook_config.model_dump conversion")
            return False

        if 'webhook_config=webhook_config' in llm_job_func:
            print("✅ llm_job_enqueue passes webhook_config to handle_llm_request")
        else:
            print("❌ Missing webhook_config parameter in handle_llm_request call")
            return False

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_create_new_task_integration():
    """Verify that create_new_task persists webhook_config into the Redis task record."""
    divider = "=" * 60
    print(f"\n{divider}")
    print("TEST 6: create_new_task Webhook Storage")
    print(divider)

    try:
        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        with open(api_path, 'r') as handle:
            source = handle.read()

        # Isolate the create_new_task function body (up to the next async def).
        start = source.find('async def create_new_task')
        end = source.find('\nasync def ', start + 1)
        if end == -1:
            end = len(source)
        func_body = source[start:end]

        if 'if webhook_config:' not in func_body:
            print("❌ Missing webhook_config check in create_new_task")
            return False
        print("✅ create_new_task checks for webhook_config")

        if 'task_data["webhook_config"] = json.dumps(webhook_config)' not in func_body:
            print("❌ Missing webhook_config storage in task_data")
            return False
        print("✅ create_new_task stores webhook_config in Redis task data")

        # Warning only: the exact background-task call shape is hard to grep for.
        if 'webhook_config' in func_body and 'background_tasks.add_task' in func_body:
            print("✅ create_new_task passes webhook_config to background task")
        else:
            print("⚠️ Could not verify webhook_config passed to background task")

        return True
    except Exception as exc:
        print(f"❌ Failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_pattern_consistency():
    """Verify /llm/job mirrors the webhook pattern used by /crawl/job."""
    bar = "=" * 60
    print(f"\n{bar}")
    print("TEST 7: Pattern Consistency with /crawl/job")
    print(bar)

    try:
        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        with open(api_path, 'r') as handle:
            source = handle.read()

        def slice_function(marker):
            # Grab the text of one async function, from its def to the next.
            begin = source.find(marker)
            finish = source.find('\nasync def ', begin + 1)
            if finish == -1:
                finish = len(source)
            return source[begin:finish]

        crawl_src = slice_function('async def handle_crawl_job')
        llm_src = slice_function('async def process_llm_extraction')

        print("Checking pattern consistency...")

        service_line = 'webhook_service = WebhookDeliveryService(config)'
        comparisons = [
            ("✅ Both initialize WebhookDeliveryService",
             "Service initialization mismatch",
             service_line in crawl_src,
             service_line in llm_src),
            ("✅ Both notify on success",
             "Success notification mismatch",
             'status="completed"' in crawl_src and 'notify_job_completion' in crawl_src,
             'status="completed"' in llm_src and 'notify_job_completion' in llm_src),
            ("✅ Both notify on failure",
             "Failure notification mismatch",
             'status="failed"' in crawl_src and 'error=' in crawl_src,
             'status="failed"' in llm_src and 'error=' in llm_src),
        ]

        for ok_message, mismatch_label, crawl_ok, llm_ok in comparisons:
            if crawl_ok and llm_ok:
                print(ok_message)
            else:
                print(f"❌ {mismatch_label} (crawl: {crawl_ok}, llm: {llm_ok})")
                return False

        print("✅ /llm/job follows the same pattern as /crawl/job")
        return True

    except Exception as exc:
        print(f"❌ Failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def main():
    """Run the full /llm/job webhook validation suite and print a summary.

    Returns 0 when every check passes, 1 otherwise (suitable for sys.exit).
    """
    print("\n🧪 LLM Job Webhook Feature Validation")
    print("=" * 60)
    print("Testing that /llm/job now supports webhooks like /crawl/job")
    print("=" * 60 + "\n")

    # Each entry pairs a display label with its test callable; running them
    # in order preserves the original console output sequence.
    suite = [
        ("LlmJobPayload Model", test_llm_job_payload_model),
        ("handle_llm_request Signature", test_handle_llm_request_signature),
        ("process_llm_extraction Signature", test_process_llm_extraction_signature),
        ("Webhook Integration", test_webhook_integration_in_api),
        ("/llm/job Endpoint", test_job_endpoint_integration),
        ("create_new_task Storage", test_create_new_task_integration),
        ("Pattern Consistency", test_pattern_consistency),
    ]
    results = [(label, check()) for label, check in suite]

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    passed = sum(1 for _, outcome in results if outcome)
    total = len(results)

    for label, outcome in results:
        print(f"{'✅ PASS' if outcome else '❌ FAIL'} - {label}")

    print(f"\n{'=' * 60}")
    print(f"Results: {passed}/{total} tests passed")
    print(f"{'=' * 60}")

    if passed == total:
        print("\n🎉 All tests passed! /llm/job webhook feature is correctly implemented.")
        print("\n📝 Summary of changes:")
        print(" 1. LlmJobPayload model includes webhook_config field")
        print(" 2. /llm/job endpoint extracts and passes webhook_config")
        print(" 3. handle_llm_request accepts webhook_config parameter")
        print(" 4. create_new_task stores webhook_config in Redis")
        print(" 5. process_llm_extraction sends webhook notifications")
        print(" 6. Follows the same pattern as /crawl/job")
        return 0

    print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
    return 1
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # exit() is the interactive site.py helper and may be unavailable under
    # `python -S`; raising SystemExit directly is the portable equivalent.
    raise SystemExit(main())
|
||||||
307
test_webhook_implementation.py
Normal file
307
test_webhook_implementation.py
Normal file
@@ -0,0 +1,307 @@
|
|||||||
|
"""
|
||||||
|
Simple test script to validate webhook implementation without running full server.
|
||||||
|
|
||||||
|
This script tests:
|
||||||
|
1. Webhook module imports and syntax
|
||||||
|
2. WebhookDeliveryService initialization
|
||||||
|
3. Payload construction logic
|
||||||
|
4. Configuration parsing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
# Add deploy/docker to path to import modules
|
||||||
|
# sys.path.insert(0, '/home/user/crawl4ai/deploy/docker')
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
||||||
|
|
||||||
|
def test_imports():
    """Confirm the webhook service and its schemas are importable."""
    divider = "=" * 60
    print(divider)
    print("TEST 1: Module Imports")
    print(divider)

    try:
        from webhook import WebhookDeliveryService  # noqa: F401 - import is the test
        print("✅ webhook.WebhookDeliveryService imported successfully")
    except Exception as exc:
        print(f"❌ Failed to import webhook module: {exc}")
        return False

    try:
        from schemas import WebhookConfig, WebhookPayload  # noqa: F401
        print("✅ schemas.WebhookConfig imported successfully")
        print("✅ schemas.WebhookPayload imported successfully")
    except Exception as exc:
        print(f"❌ Failed to import schemas: {exc}")
        return False

    return True
|
||||||
|
|
||||||
|
def test_webhook_service_init():
    """Instantiate WebhookDeliveryService and verify derived retry settings."""
    divider = "=" * 60
    print(f"\n{divider}")
    print("TEST 2: WebhookDeliveryService Initialization")
    print(divider)

    try:
        from webhook import WebhookDeliveryService

        # Default-style config: retry values are in milliseconds; the service
        # is expected to convert them to seconds.
        retry_settings = {
            "max_attempts": 5,
            "initial_delay_ms": 1000,
            "max_delay_ms": 32000,
            "timeout_ms": 30000,
        }
        config = {
            "webhooks": {
                "enabled": True,
                "default_url": None,
                "data_in_payload": False,
                "retry": retry_settings,
                "headers": {"User-Agent": "Crawl4AI-Webhook/1.0"},
            }
        }

        service = WebhookDeliveryService(config)

        print(f"✅ Service initialized successfully")
        print(f" - Max attempts: {service.max_attempts}")
        print(f" - Initial delay: {service.initial_delay}s")
        print(f" - Max delay: {service.max_delay}s")
        print(f" - Timeout: {service.timeout}s")

        # Verify the ms -> s conversions.
        for attr, expected, message in (
            ("max_attempts", 5, "Max attempts should be 5"),
            ("initial_delay", 1.0, "Initial delay should be 1.0s"),
            ("max_delay", 32.0, "Max delay should be 32.0s"),
            ("timeout", 30.0, "Timeout should be 30.0s"),
        ):
            assert getattr(service, attr) == expected, message

        print("✅ All configuration values correct")

        return True
    except Exception as exc:
        print(f"❌ Service initialization failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_webhook_config_model():
    """Test WebhookConfig Pydantic model validation.

    Exercises three cases: a fully-populated config, a minimal config relying
    on defaults, and an invalid URL that must raise ValidationError.

    Returns:
        bool: True when all three cases behave as expected, False otherwise.
    """
    print("\n" + "=" * 60)
    print("TEST 3: WebhookConfig Model Validation")
    print("=" * 60)

    try:
        from schemas import WebhookConfig
        from pydantic import ValidationError

        # Test valid config
        valid_config = {
            "webhook_url": "https://example.com/webhook",
            "webhook_data_in_payload": True,
            "webhook_headers": {"X-Secret": "token123"}
        }

        config = WebhookConfig(**valid_config)
        print(f"✅ Valid config accepted:")
        print(f" - URL: {config.webhook_url}")
        print(f" - Data in payload: {config.webhook_data_in_payload}")
        print(f" - Headers: {config.webhook_headers}")

        # Test minimal config
        minimal_config = {
            "webhook_url": "https://example.com/webhook"
        }

        config2 = WebhookConfig(**minimal_config)
        print(f"✅ Minimal config accepted (defaults applied):")
        print(f" - URL: {config2.webhook_url}")
        print(f" - Data in payload: {config2.webhook_data_in_payload}")
        print(f" - Headers: {config2.webhook_headers}")

        # Test invalid URL: construction must raise, so the instance is
        # deliberately not bound (it previously sat in an unused variable).
        try:
            WebhookConfig(webhook_url="not-a-url")
            print(f"❌ Invalid URL should have been rejected")
            return False
        except ValidationError:
            print(f"✅ Invalid URL correctly rejected")

        return True
    except Exception as e:
        print(f"❌ Model validation test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_payload_construction():
    """Exercise webhook payload construction for basic, error, and data cases."""
    divider = "=" * 60
    print(f"\n{divider}")
    print("TEST 4: Payload Construction")
    print(divider)

    try:
        def build(task_id, status, urls, **extra):
            # Mirrors the dict built by notify_job_completion; keyword extras
            # (error/data) land after the common fields, preserving key order.
            body = {
                "task_id": task_id,
                "task_type": "crawl",
                "status": status,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "urls": urls,
            }
            body.update(extra)
            return body

        basic = build("crawl_abc123", "completed", ["https://example.com"])
        print(f"✅ Basic payload constructed:")
        print(json.dumps(basic, indent=2))

        failed = build("crawl_xyz789", "failed", ["https://example.com"],
                       error="Connection timeout")
        print(f"\n✅ Error payload constructed:")
        print(json.dumps(failed, indent=2))

        with_data = build(
            "crawl_def456", "completed", ["https://example.com"],
            data={"results": [{"url": "https://example.com", "markdown": "# Example"}]},
        )
        print(f"\n✅ Data payload constructed:")
        print(json.dumps(with_data, indent=2))

        return True
    except Exception as exc:
        print(f"❌ Payload construction failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
def test_exponential_backoff():
    """Check the retry delay doubling sequence against expected values."""
    divider = "=" * 60
    print(f"\n{divider}")
    print("TEST 5: Exponential Backoff Calculation")
    print(divider)

    try:
        base_delay = 1.0   # first retry wait, seconds
        ceiling = 32.0     # delays are capped at this many seconds

        print("Backoff delays for 5 attempts:")
        delays = [min(base_delay * (2 ** attempt), ceiling) for attempt in range(5)]
        for attempt_no, wait in enumerate(delays, start=1):
            print(f" Attempt {attempt_no}: {wait}s")

        # Doubling from 1s; the 32s cap is never reached within 5 tries.
        expected = [1.0, 2.0, 4.0, 8.0, 16.0]
        assert delays == expected, f"Expected {expected}, got {delays}"
        print("✅ Exponential backoff sequence correct")

        return True
    except Exception as exc:
        print(f"❌ Backoff calculation failed: {exc}")
        return False
|
||||||
|
|
||||||
|
def test_api_integration():
    """Check that deploy/docker/api.py wires in the webhook module."""
    divider = "=" * 60
    print(f"\n{divider}")
    print("TEST 6: API Integration")
    print(divider)

    try:
        # Inspect api.py as text rather than importing it.
        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        with open(api_path, 'r') as handle:
            contents = handle.read()

        required = [
            ('from webhook import WebhookDeliveryService',
             "✅ api.py imports WebhookDeliveryService",
             "❌ api.py missing webhook import"),
            ('WebhookDeliveryService(config)',
             "✅ api.py initializes WebhookDeliveryService",
             "❌ api.py doesn't initialize WebhookDeliveryService"),
            ('notify_job_completion',
             "✅ api.py calls notify_job_completion",
             "❌ api.py doesn't call notify_job_completion"),
        ]
        for needle, found_msg, missing_msg in required:
            if needle in contents:
                print(found_msg)
            else:
                print(missing_msg)
                return False

        return True
    except Exception as exc:
        print(f"❌ API integration check failed: {exc}")
        return False
|
||||||
|
|
||||||
|
def main():
    """Run all webhook implementation validation tests and print a summary.

    Returns 0 when every check passes, 1 otherwise (suitable for sys.exit).
    """
    print("\n🧪 Webhook Implementation Validation Tests")
    print("=" * 60)

    # Label/callable pairs, executed in order to keep the output sequence.
    suite = [
        ("Module Imports", test_imports),
        ("Service Initialization", test_webhook_service_init),
        ("Config Model", test_webhook_config_model),
        ("Payload Construction", test_payload_construction),
        ("Exponential Backoff", test_exponential_backoff),
        ("API Integration", test_api_integration),
    ]
    results = [(label, check()) for label, check in suite]

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    passed = sum(1 for _, outcome in results if outcome)
    total = len(results)

    for label, outcome in results:
        print(f"{'✅ PASS' if outcome else '❌ FAIL'} - {label}")

    print(f"\n{'=' * 60}")
    print(f"Results: {passed}/{total} tests passed")
    print(f"{'=' * 60}")

    if passed == total:
        print("\n🎉 All tests passed! Webhook implementation is valid.")
        return 0

    print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
    return 1
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # exit() is the interactive site.py helper; sys.exit is the proper way to
    # terminate a script with a status code (sys is imported at module top).
    sys.exit(main())
|
||||||
251
tests/WEBHOOK_TEST_README.md
Normal file
251
tests/WEBHOOK_TEST_README.md
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
# Webhook Feature Test Script
|
||||||
|
|
||||||
|
This directory contains a comprehensive test script for the webhook feature implementation.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The `test_webhook_feature.sh` script automates the entire process of testing the webhook feature:
|
||||||
|
|
||||||
|
1. ✅ Fetches and switches to the webhook feature branch
|
||||||
|
2. ✅ Activates the virtual environment
|
||||||
|
3. ✅ Installs all required dependencies
|
||||||
|
4. ✅ Starts Redis server in background
|
||||||
|
5. ✅ Starts Crawl4AI server in background
|
||||||
|
6. ✅ Runs webhook integration test
|
||||||
|
7. ✅ Verifies job completion via webhook
|
||||||
|
8. ✅ Cleans up and returns to original branch
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Python 3.10+
|
||||||
|
- Virtual environment already created (`venv/` in project root)
|
||||||
|
- Git repository with the webhook feature branch
|
||||||
|
- `redis-server` (script will attempt to install if missing)
|
||||||
|
- `curl` and `lsof` commands available
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Quick Start
|
||||||
|
|
||||||
|
From the project root:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./tests/test_webhook_feature.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Or from the tests directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd tests
|
||||||
|
./test_webhook_feature.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### What the Script Does
|
||||||
|
|
||||||
|
#### Step 1: Branch Management
|
||||||
|
- Saves your current branch
|
||||||
|
- Fetches the webhook feature branch from remote
|
||||||
|
- Switches to the webhook feature branch
|
||||||
|
|
||||||
|
#### Step 2: Environment Setup
|
||||||
|
- Activates your existing virtual environment
|
||||||
|
- Installs dependencies from `deploy/docker/requirements.txt`
|
||||||
|
- Installs Flask for the webhook receiver
|
||||||
|
|
||||||
|
#### Step 3: Service Startup
|
||||||
|
- Starts Redis server on port 6379
|
||||||
|
- Starts Crawl4AI server on port 11235
|
||||||
|
- Waits for server health check to pass
|
||||||
|
|
||||||
|
#### Step 4: Webhook Test
|
||||||
|
- Creates a webhook receiver on port 8080
|
||||||
|
- Submits a crawl job for `https://example.com` with webhook config
|
||||||
|
- Waits for webhook notification (60s timeout)
|
||||||
|
- Verifies webhook payload contains expected data
|
||||||
|
|
||||||
|
#### Step 5: Cleanup
|
||||||
|
- Stops webhook receiver
|
||||||
|
- Stops Crawl4AI server
|
||||||
|
- Stops Redis server
|
||||||
|
- Returns to your original branch
|
||||||
|
|
||||||
|
## Expected Output
|
||||||
|
|
||||||
|
```
|
||||||
|
[INFO] Starting webhook feature test script
|
||||||
|
[INFO] Project root: /path/to/crawl4ai
|
||||||
|
[INFO] Step 1: Fetching PR branch...
|
||||||
|
[INFO] Current branch: develop
|
||||||
|
[SUCCESS] Branch fetched
|
||||||
|
[INFO] Step 2: Switching to branch: claude/implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp
|
||||||
|
[SUCCESS] Switched to webhook feature branch
|
||||||
|
[INFO] Step 3: Activating virtual environment...
|
||||||
|
[SUCCESS] Virtual environment activated
|
||||||
|
[INFO] Step 4: Installing server dependencies...
|
||||||
|
[SUCCESS] Dependencies installed
|
||||||
|
[INFO] Step 5a: Starting Redis...
|
||||||
|
[SUCCESS] Redis started (PID: 12345)
|
||||||
|
[INFO] Step 5b: Starting server on port 11235...
|
||||||
|
[INFO] Server started (PID: 12346)
|
||||||
|
[INFO] Waiting for server to be ready...
|
||||||
|
[SUCCESS] Server is ready!
|
||||||
|
[INFO] Step 6: Creating webhook test script...
|
||||||
|
[INFO] Running webhook test...
|
||||||
|
|
||||||
|
🚀 Submitting crawl job with webhook...
|
||||||
|
✅ Job submitted successfully, task_id: crawl_abc123
|
||||||
|
⏳ Waiting for webhook notification...
|
||||||
|
|
||||||
|
✅ Webhook received: {
|
||||||
|
"task_id": "crawl_abc123",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-22T00:00:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": { ... }
|
||||||
|
}
|
||||||
|
|
||||||
|
✅ Webhook received!
|
||||||
|
Task ID: crawl_abc123
|
||||||
|
Status: completed
|
||||||
|
URLs: ['https://example.com']
|
||||||
|
✅ Data included in webhook payload
|
||||||
|
📄 Crawled 1 URL(s)
|
||||||
|
- https://example.com: 1234 chars
|
||||||
|
|
||||||
|
🎉 Webhook test PASSED!
|
||||||
|
|
||||||
|
[INFO] Step 7: Verifying test results...
|
||||||
|
[SUCCESS] ✅ Webhook test PASSED!
|
||||||
|
[SUCCESS] All tests completed successfully! 🎉
|
||||||
|
[INFO] Cleanup will happen automatically...
|
||||||
|
[INFO] Starting cleanup...
|
||||||
|
[INFO] Stopping webhook receiver...
|
||||||
|
[INFO] Stopping server...
|
||||||
|
[INFO] Stopping Redis...
|
||||||
|
[INFO] Switching back to branch: develop
|
||||||
|
[SUCCESS] Cleanup complete
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Server Failed to Start
|
||||||
|
|
||||||
|
If the server fails to start, check the logs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tail -100 /tmp/crawl4ai_server.log
|
||||||
|
```
|
||||||
|
|
||||||
|
Common issues:
|
||||||
|
- Port 11235 already in use: `lsof -ti:11235 | xargs kill -9`
|
||||||
|
- Missing dependencies: Check that all packages are installed
|
||||||
|
|
||||||
|
### Redis Connection Failed
|
||||||
|
|
||||||
|
Check if Redis is running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
redis-cli ping
|
||||||
|
# Should return: PONG
|
||||||
|
```
|
||||||
|
|
||||||
|
If not running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
redis-server --port 6379 --daemonize yes
|
||||||
|
```
|
||||||
|
|
||||||
|
### Webhook Not Received
|
||||||
|
|
||||||
|
The script has a 60-second timeout for webhook delivery. If the webhook isn't received:
|
||||||
|
|
||||||
|
1. Check server logs: `/tmp/crawl4ai_server.log`
|
||||||
|
2. Verify webhook receiver is running on port 8080
|
||||||
|
3. Check network connectivity between components
|
||||||
|
|
||||||
|
### Script Interruption
|
||||||
|
|
||||||
|
If the script is interrupted (Ctrl+C), cleanup happens automatically via trap. The script will:
|
||||||
|
- Kill all background processes
|
||||||
|
- Stop Redis
|
||||||
|
- Return to your original branch
|
||||||
|
|
||||||
|
To manually cleanup if needed:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Kill processes by port
|
||||||
|
lsof -ti:11235 | xargs kill -9 # Server
|
||||||
|
lsof -ti:8080 | xargs kill -9 # Webhook receiver
|
||||||
|
lsof -ti:6379 | xargs kill -9 # Redis
|
||||||
|
|
||||||
|
# Return to your branch
|
||||||
|
git checkout develop # or your branch name
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing Different URLs
|
||||||
|
|
||||||
|
To test with a different URL, modify the script or create a custom test:
|
||||||
|
|
||||||
|
```python
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://your-url-here.com"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "http://localhost:8080/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Files Generated
|
||||||
|
|
||||||
|
The script creates temporary files:
|
||||||
|
|
||||||
|
- `/tmp/crawl4ai_server.log` - Server output logs
|
||||||
|
- `/tmp/test_webhook.py` - Webhook test Python script
|
||||||
|
|
||||||
|
These are not cleaned up automatically so you can review them after the test.
|
||||||
|
|
||||||
|
## Exit Codes
|
||||||
|
|
||||||
|
- `0` - All tests passed successfully
|
||||||
|
- `1` - Test failed (check output for details)
|
||||||
|
|
||||||
|
## Safety Features
|
||||||
|
|
||||||
|
- ✅ Automatic cleanup on exit, interrupt, or error
|
||||||
|
- ✅ Returns to original branch on completion
|
||||||
|
- ✅ Kills all background processes
|
||||||
|
- ✅ Comprehensive error handling
|
||||||
|
- ✅ Colored output for easy reading
|
||||||
|
- ✅ Detailed logging at each step
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The script uses `set -e` to exit on any command failure
|
||||||
|
- All background processes are tracked and cleaned up
|
||||||
|
- The virtual environment must exist before running
|
||||||
|
- Redis must be available (installed or installable via apt-get/brew)
|
||||||
|
|
||||||
|
## Integration with CI/CD
|
||||||
|
|
||||||
|
This script can be integrated into CI/CD pipelines:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Example GitHub Actions
|
||||||
|
- name: Test Webhook Feature
|
||||||
|
run: |
|
||||||
|
chmod +x tests/test_webhook_feature.sh
|
||||||
|
./tests/test_webhook_feature.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
If you encounter issues:
|
||||||
|
|
||||||
|
1. Check the troubleshooting section above
|
||||||
|
2. Review server logs at `/tmp/crawl4ai_server.log`
|
||||||
|
3. Ensure all prerequisites are met
|
||||||
|
4. Open an issue with the full output of the script
|
||||||
193
tests/docker/test_hooks_utility.py
Normal file
193
tests/docker/test_hooks_utility.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
"""
|
||||||
|
Test script demonstrating the hooks_to_string utility and Docker client integration.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import Crawl4aiDockerClient, hooks_to_string
|
||||||
|
|
||||||
|
|
||||||
|
# Define hook functions as regular Python functions
|
||||||
|
async def auth_hook(page, context, **kwargs):
|
||||||
|
"""Add authentication cookies."""
|
||||||
|
await context.add_cookies([{
|
||||||
|
'name': 'test_cookie',
|
||||||
|
'value': 'test_value',
|
||||||
|
'domain': '.httpbin.org',
|
||||||
|
'path': '/'
|
||||||
|
}])
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def scroll_hook(page, context, **kwargs):
|
||||||
|
"""Scroll to load lazy content."""
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def viewport_hook(page, context, **kwargs):
|
||||||
|
"""Set custom viewport."""
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def test_hooks_utility():
|
||||||
|
"""Test the hooks_to_string utility function."""
|
||||||
|
print("=" * 60)
|
||||||
|
print("Testing hooks_to_string utility")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Create hooks dictionary with function objects
|
||||||
|
hooks_dict = {
|
||||||
|
"on_page_context_created": auth_hook,
|
||||||
|
"before_retrieve_html": scroll_hook
|
||||||
|
}
|
||||||
|
|
||||||
|
# Convert to string format
|
||||||
|
hooks_string = hooks_to_string(hooks_dict)
|
||||||
|
|
||||||
|
print("\n✓ Successfully converted function objects to strings")
|
||||||
|
print(f"\n✓ Converted {len(hooks_string)} hooks:")
|
||||||
|
for hook_name in hooks_string.keys():
|
||||||
|
print(f" - {hook_name}")
|
||||||
|
|
||||||
|
print("\n✓ Preview of converted hook:")
|
||||||
|
print("-" * 60)
|
||||||
|
print(hooks_string["on_page_context_created"][:200] + "...")
|
||||||
|
print("-" * 60)
|
||||||
|
|
||||||
|
return hooks_string
|
||||||
|
|
||||||
|
|
||||||
|
async def test_docker_client_with_functions():
|
||||||
|
"""Test Docker client with function objects (automatic conversion)."""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Testing Docker Client with Function Objects")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Note: This requires a running Crawl4AI Docker server
|
||||||
|
# Uncomment the following to test with actual server:
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
|
||||||
|
# Pass function objects directly - they'll be converted automatically
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": auth_hook,
|
||||||
|
"before_retrieve_html": scroll_hook
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
print(f"\n✓ Crawl successful: {result.success}")
|
||||||
|
print(f"✓ URL: {result.url}")
|
||||||
|
|
||||||
|
print("\n✓ Docker client accepts function objects directly")
|
||||||
|
print("✓ Automatic conversion happens internally")
|
||||||
|
print("✓ No manual string formatting needed!")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_docker_client_with_strings():
|
||||||
|
"""Test Docker client with pre-converted strings."""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Testing Docker Client with String Hooks")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Convert hooks to strings first
|
||||||
|
hooks_dict = {
|
||||||
|
"on_page_context_created": viewport_hook,
|
||||||
|
"before_retrieve_html": scroll_hook
|
||||||
|
}
|
||||||
|
hooks_string = hooks_to_string(hooks_dict)
|
||||||
|
|
||||||
|
# Note: This requires a running Crawl4AI Docker server
|
||||||
|
# Uncomment the following to test with actual server:
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
|
||||||
|
# Pass string hooks - they'll be used as-is
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
hooks=hooks_string,
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
print(f"\n✓ Crawl successful: {result.success}")
|
||||||
|
|
||||||
|
print("\n✓ Docker client also accepts pre-converted strings")
|
||||||
|
print("✓ Backward compatible with existing code")
|
||||||
|
|
||||||
|
|
||||||
|
async def show_usage_patterns():
|
||||||
|
"""Show different usage patterns."""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Usage Patterns")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
print("\n1. Direct function usage (simplest):")
|
||||||
|
print("-" * 60)
|
||||||
|
print("""
|
||||||
|
async def my_hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks={"on_page_context_created": my_hook}
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
|
||||||
|
print("\n2. Convert then use:")
|
||||||
|
print("-" * 60)
|
||||||
|
print("""
|
||||||
|
hooks_dict = {"on_page_context_created": my_hook}
|
||||||
|
hooks_string = hooks_to_string(hooks_dict)
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks=hooks_string
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
|
||||||
|
print("\n3. Manual string (backward compatible):")
|
||||||
|
print("-" * 60)
|
||||||
|
print("""
|
||||||
|
hooks_string = {
|
||||||
|
"on_page_context_created": '''
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
'''
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks=hooks_string
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
"""Run all tests."""
|
||||||
|
print("\n🚀 Crawl4AI Hooks Utility Test Suite\n")
|
||||||
|
|
||||||
|
# Test the utility function
|
||||||
|
# await test_hooks_utility()
|
||||||
|
|
||||||
|
# Show usage with Docker client
|
||||||
|
# await test_docker_client_with_functions()
|
||||||
|
await test_docker_client_with_strings()
|
||||||
|
|
||||||
|
# Show different patterns
|
||||||
|
# await show_usage_patterns()
|
||||||
|
|
||||||
|
# print("\n" + "=" * 60)
|
||||||
|
# print("✓ All tests completed successfully!")
|
||||||
|
# print("=" * 60)
|
||||||
|
# print("\nKey Benefits:")
|
||||||
|
# print(" • Write hooks as regular Python functions")
|
||||||
|
# print(" • IDE support with autocomplete and type checking")
|
||||||
|
# print(" • Automatic conversion to API format")
|
||||||
|
# print(" • Backward compatible with string hooks")
|
||||||
|
# print(" • Same utility used everywhere")
|
||||||
|
# print("\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
305
tests/test_webhook_feature.sh
Executable file
305
tests/test_webhook_feature.sh
Executable file
@@ -0,0 +1,305 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Webhook Feature Test Script
|
||||||
|
#
|
||||||
|
# This script tests the webhook feature implementation by:
|
||||||
|
# 1. Switching to the webhook feature branch
|
||||||
|
# 2. Installing dependencies
|
||||||
|
# 3. Starting the server
|
||||||
|
# 4. Running webhook tests
|
||||||
|
# 5. Cleaning up and returning to original branch
|
||||||
|
#
|
||||||
|
# Usage: ./test_webhook_feature.sh
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
|
set -e # Exit on error
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
BLUE='\033[0;34m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
BRANCH_NAME="claude/implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp"
|
||||||
|
VENV_PATH="venv"
|
||||||
|
SERVER_PORT=11235
|
||||||
|
WEBHOOK_PORT=8080
|
||||||
|
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||||
|
|
||||||
|
# PID files for cleanup
|
||||||
|
REDIS_PID=""
|
||||||
|
SERVER_PID=""
|
||||||
|
WEBHOOK_PID=""
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Utility Functions
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
|
log_info() {
|
||||||
|
echo -e "${BLUE}[INFO]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_success() {
|
||||||
|
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warning() {
|
||||||
|
echo -e "${YELLOW}[WARNING]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_error() {
|
||||||
|
echo -e "${RED}[ERROR]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
log_info "Starting cleanup..."
|
||||||
|
|
||||||
|
# Kill webhook receiver if running
|
||||||
|
if [ ! -z "$WEBHOOK_PID" ] && kill -0 $WEBHOOK_PID 2>/dev/null; then
|
||||||
|
log_info "Stopping webhook receiver (PID: $WEBHOOK_PID)..."
|
||||||
|
kill $WEBHOOK_PID 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Kill server if running
|
||||||
|
if [ ! -z "$SERVER_PID" ] && kill -0 $SERVER_PID 2>/dev/null; then
|
||||||
|
log_info "Stopping server (PID: $SERVER_PID)..."
|
||||||
|
kill $SERVER_PID 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Kill Redis if running
|
||||||
|
if [ ! -z "$REDIS_PID" ] && kill -0 $REDIS_PID 2>/dev/null; then
|
||||||
|
log_info "Stopping Redis (PID: $REDIS_PID)..."
|
||||||
|
kill $REDIS_PID 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Also kill by port if PIDs didn't work
|
||||||
|
lsof -ti:$SERVER_PORT | xargs kill -9 2>/dev/null || true
|
||||||
|
lsof -ti:$WEBHOOK_PORT | xargs kill -9 2>/dev/null || true
|
||||||
|
lsof -ti:6379 | xargs kill -9 2>/dev/null || true
|
||||||
|
|
||||||
|
# Return to original branch
|
||||||
|
if [ ! -z "$ORIGINAL_BRANCH" ]; then
|
||||||
|
log_info "Switching back to branch: $ORIGINAL_BRANCH"
|
||||||
|
git checkout $ORIGINAL_BRANCH 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_success "Cleanup complete"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Set trap to cleanup on exit
|
||||||
|
trap cleanup EXIT INT TERM
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Main Script
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
|
log_info "Starting webhook feature test script"
|
||||||
|
log_info "Project root: $PROJECT_ROOT"
|
||||||
|
|
||||||
|
cd "$PROJECT_ROOT"
|
||||||
|
|
||||||
|
# Step 1: Save current branch and fetch PR
|
||||||
|
log_info "Step 1: Fetching PR branch..."
|
||||||
|
ORIGINAL_BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||||
|
log_info "Current branch: $ORIGINAL_BRANCH"
|
||||||
|
|
||||||
|
git fetch origin $BRANCH_NAME
|
||||||
|
log_success "Branch fetched"
|
||||||
|
|
||||||
|
# Step 2: Switch to new branch
|
||||||
|
log_info "Step 2: Switching to branch: $BRANCH_NAME"
|
||||||
|
git checkout $BRANCH_NAME
|
||||||
|
log_success "Switched to webhook feature branch"
|
||||||
|
|
||||||
|
# Step 3: Activate virtual environment
|
||||||
|
log_info "Step 3: Activating virtual environment..."
|
||||||
|
if [ ! -d "$VENV_PATH" ]; then
|
||||||
|
log_error "Virtual environment not found at $VENV_PATH"
|
||||||
|
log_info "Creating virtual environment..."
|
||||||
|
python3 -m venv $VENV_PATH
|
||||||
|
fi
|
||||||
|
|
||||||
|
source $VENV_PATH/bin/activate
|
||||||
|
log_success "Virtual environment activated: $(which python)"
|
||||||
|
|
||||||
|
# Step 4: Install server dependencies
|
||||||
|
log_info "Step 4: Installing server dependencies..."
|
||||||
|
pip install -q -r deploy/docker/requirements.txt
|
||||||
|
log_success "Dependencies installed"
|
||||||
|
|
||||||
|
# Check if Redis is available
|
||||||
|
log_info "Checking Redis availability..."
|
||||||
|
if ! command -v redis-server &> /dev/null; then
|
||||||
|
log_warning "Redis not found, attempting to install..."
|
||||||
|
if command -v apt-get &> /dev/null; then
|
||||||
|
sudo apt-get update && sudo apt-get install -y redis-server
|
||||||
|
elif command -v brew &> /dev/null; then
|
||||||
|
brew install redis
|
||||||
|
else
|
||||||
|
log_error "Cannot install Redis automatically. Please install Redis manually."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Step 5: Start Redis in background
|
||||||
|
log_info "Step 5a: Starting Redis..."
|
||||||
|
redis-server --port 6379 --daemonize yes
|
||||||
|
sleep 2
|
||||||
|
REDIS_PID=$(pgrep redis-server)
|
||||||
|
log_success "Redis started (PID: $REDIS_PID)"
|
||||||
|
|
||||||
|
# Step 5b: Start server in background
|
||||||
|
log_info "Step 5b: Starting server on port $SERVER_PORT..."
|
||||||
|
cd deploy/docker
|
||||||
|
|
||||||
|
# Start server in background
|
||||||
|
python3 -m uvicorn server:app --host 0.0.0.0 --port $SERVER_PORT > /tmp/crawl4ai_server.log 2>&1 &
|
||||||
|
SERVER_PID=$!
|
||||||
|
cd "$PROJECT_ROOT"
|
||||||
|
|
||||||
|
log_info "Server started (PID: $SERVER_PID)"
|
||||||
|
|
||||||
|
# Wait for server to be ready
|
||||||
|
log_info "Waiting for server to be ready..."
|
||||||
|
for i in {1..30}; do
|
||||||
|
if curl -s http://localhost:$SERVER_PORT/health > /dev/null 2>&1; then
|
||||||
|
log_success "Server is ready!"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
if [ $i -eq 30 ]; then
|
||||||
|
log_error "Server failed to start within 30 seconds"
|
||||||
|
log_info "Server logs:"
|
||||||
|
tail -50 /tmp/crawl4ai_server.log
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo -n "."
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Step 6: Create and run webhook test
|
||||||
|
log_info "Step 6: Creating webhook test script..."
|
||||||
|
|
||||||
|
cat > /tmp/test_webhook.py << 'PYTHON_SCRIPT'
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
from threading import Thread, Event
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CRAWL4AI_BASE_URL = "http://localhost:11235"
|
||||||
|
WEBHOOK_BASE_URL = "http://localhost:8080"
|
||||||
|
|
||||||
|
# Flask app for webhook receiver
|
||||||
|
app = Flask(__name__)
|
||||||
|
webhook_received = Event()
|
||||||
|
webhook_data = {}
|
||||||
|
|
||||||
|
@app.route('/webhook', methods=['POST'])
|
||||||
|
def handle_webhook():
|
||||||
|
global webhook_data
|
||||||
|
webhook_data = request.json
|
||||||
|
webhook_received.set()
|
||||||
|
print(f"\n✅ Webhook received: {json.dumps(webhook_data, indent=2)}")
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
def start_webhook_server():
|
||||||
|
app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|
||||||
|
|
||||||
|
# Start webhook server in background
|
||||||
|
webhook_thread = Thread(target=start_webhook_server, daemon=True)
|
||||||
|
webhook_thread.start()
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
print("🚀 Submitting crawl job with webhook...")
|
||||||
|
|
||||||
|
# Submit job with webhook
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{CRAWL4AI_BASE_URL}/crawl/job",
|
||||||
|
json=payload,
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if not response.ok:
|
||||||
|
print(f"❌ Failed to submit job: {response.text}")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"✅ Job submitted successfully, task_id: {task_id}")
|
||||||
|
|
||||||
|
# Wait for webhook (with timeout)
|
||||||
|
print("⏳ Waiting for webhook notification...")
|
||||||
|
if webhook_received.wait(timeout=60):
|
||||||
|
print(f"✅ Webhook received!")
|
||||||
|
print(f" Task ID: {webhook_data.get('task_id')}")
|
||||||
|
print(f" Status: {webhook_data.get('status')}")
|
||||||
|
print(f" URLs: {webhook_data.get('urls')}")
|
||||||
|
|
||||||
|
if webhook_data.get('status') == 'completed':
|
||||||
|
if 'data' in webhook_data:
|
||||||
|
print(f" ✅ Data included in webhook payload")
|
||||||
|
results = webhook_data['data'].get('results', [])
|
||||||
|
if results:
|
||||||
|
print(f" 📄 Crawled {len(results)} URL(s)")
|
||||||
|
for result in results:
|
||||||
|
print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
|
||||||
|
print("\n🎉 Webhook test PASSED!")
|
||||||
|
exit(0)
|
||||||
|
else:
|
||||||
|
print(f" ❌ Job failed: {webhook_data.get('error')}")
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
print("❌ Webhook not received within 60 seconds")
|
||||||
|
# Try polling as fallback
|
||||||
|
print("⏳ Trying to poll job status...")
|
||||||
|
for i in range(10):
|
||||||
|
status_response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")
|
||||||
|
if status_response.ok:
|
||||||
|
status = status_response.json()
|
||||||
|
print(f" Status: {status.get('status')}")
|
||||||
|
if status.get('status') in ['completed', 'failed']:
|
||||||
|
break
|
||||||
|
time.sleep(2)
|
||||||
|
exit(1)
|
||||||
|
PYTHON_SCRIPT
|
||||||
|
|
||||||
|
# Install Flask for webhook receiver
|
||||||
|
pip install -q flask
|
||||||
|
|
||||||
|
# Run the webhook test
|
||||||
|
log_info "Running webhook test..."
|
||||||
|
python3 /tmp/test_webhook.py &
|
||||||
|
WEBHOOK_PID=$!
|
||||||
|
|
||||||
|
# Wait for test to complete
|
||||||
|
wait $WEBHOOK_PID
|
||||||
|
TEST_EXIT_CODE=$?
|
||||||
|
|
||||||
|
# Step 7: Verify results
|
||||||
|
log_info "Step 7: Verifying test results..."
|
||||||
|
if [ $TEST_EXIT_CODE -eq 0 ]; then
|
||||||
|
log_success "✅ Webhook test PASSED!"
|
||||||
|
else
|
||||||
|
log_error "❌ Webhook test FAILED (exit code: $TEST_EXIT_CODE)"
|
||||||
|
log_info "Server logs:"
|
||||||
|
tail -100 /tmp/crawl4ai_server.log
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Step 8: Cleanup happens automatically via trap
|
||||||
|
log_success "All tests completed successfully! 🎉"
|
||||||
|
log_info "Cleanup will happen automatically..."
|
||||||
Reference in New Issue
Block a user