feat: Add hooks utility for function-based hooks with Docker client integration. ref #1377
Add hooks_to_string() utility function that converts Python function objects to string representations for the Docker API, enabling developers to write hooks as regular Python functions instead of strings. Core Changes: - New hooks_to_string() utility in crawl4ai/utils.py using inspect.getsource() - Docker client now accepts both function objects and strings for hooks - Automatic detection and conversion in Crawl4aiDockerClient._prepare_request() - New hooks and hooks_timeout parameters in client.crawl() method Documentation: - Docker client examples with function-based hooks (docs/examples/docker_client_hooks_example.py) - Updated main Docker deployment guide with comprehensive hooks section - Added unit tests for hooks utility (tests/docker/test_hooks_utility.py)
This commit is contained in:
@@ -103,7 +103,8 @@ from .browser_adapter import (
|
||||
|
||||
from .utils import (
|
||||
start_colab_display_server,
|
||||
setup_colab_environment
|
||||
setup_colab_environment,
|
||||
hooks_to_string
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
@@ -183,6 +184,7 @@ __all__ = [
|
||||
"ProxyConfig",
|
||||
"start_colab_display_server",
|
||||
"setup_colab_environment",
|
||||
"hooks_to_string",
|
||||
# C4A Script additions
|
||||
"c4a_compile",
|
||||
"c4a_validate",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any
|
||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
|
||||
import httpx
|
||||
import json
|
||||
from urllib.parse import urljoin
|
||||
@@ -7,6 +7,7 @@ import asyncio
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .models import CrawlResult
|
||||
from .async_logger import AsyncLogger, LogLevel
|
||||
from .utils import hooks_to_string
|
||||
|
||||
|
||||
class Crawl4aiClientError(Exception):
|
||||
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
|
||||
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
||||
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
||||
|
||||
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
||||
def _prepare_request(
|
||||
self,
|
||||
urls: List[str],
|
||||
browser_config: Optional[BrowserConfig] = None,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||
hooks_timeout: int = 30
|
||||
) -> Dict[str, Any]:
|
||||
"""Prepare request data from configs."""
|
||||
if self._token:
|
||||
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||
return {
|
||||
|
||||
request_data = {
|
||||
"urls": urls,
|
||||
"browser_config": browser_config.dump() if browser_config else {},
|
||||
"crawler_config": crawler_config.dump() if crawler_config else {}
|
||||
}
|
||||
|
||||
# Handle hooks if provided
|
||||
if hooks:
|
||||
# Check if hooks are already strings or need conversion
|
||||
if any(callable(v) for v in hooks.values()):
|
||||
# Convert function objects to strings
|
||||
hooks_code = hooks_to_string(hooks)
|
||||
else:
|
||||
# Already in string format
|
||||
hooks_code = hooks
|
||||
|
||||
request_data["hooks"] = {
|
||||
"code": hooks_code,
|
||||
"timeout": hooks_timeout
|
||||
}
|
||||
|
||||
return request_data
|
||||
|
||||
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
||||
"""Make an HTTP request with error handling."""
|
||||
url = urljoin(self.base_url, endpoint)
|
||||
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
|
||||
self,
|
||||
urls: List[str],
|
||||
browser_config: Optional[BrowserConfig] = None,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None
|
||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||
hooks_timeout: int = 30
|
||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
"""Execute a crawl operation."""
|
||||
"""
|
||||
Execute a crawl operation.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to crawl
|
||||
browser_config: Browser configuration
|
||||
crawler_config: Crawler configuration
|
||||
hooks: Optional hooks - can be either:
|
||||
- Dict[str, Callable]: Function objects that will be converted to strings
|
||||
- Dict[str, str]: Already stringified hook code
|
||||
hooks_timeout: Timeout in seconds for each hook execution (1-120)
|
||||
|
||||
Returns:
|
||||
Single CrawlResult, list of results, or async generator for streaming
|
||||
|
||||
Example with function hooks:
|
||||
>>> async def my_hook(page, context, **kwargs):
|
||||
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
... return page
|
||||
>>>
|
||||
>>> result = await client.crawl(
|
||||
... ["https://example.com"],
|
||||
... hooks={"on_page_context_created": my_hook}
|
||||
... )
|
||||
"""
|
||||
await self._check_server()
|
||||
|
||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
||||
|
||||
data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
|
||||
is_streaming = crawler_config and crawler_config.stream
|
||||
|
||||
|
||||
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
||||
|
||||
|
||||
if is_streaming:
|
||||
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
||||
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
||||
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
|
||||
else:
|
||||
yield CrawlResult(**result)
|
||||
return stream_results()
|
||||
|
||||
|
||||
response = await self._request("POST", "/crawl", json=data)
|
||||
result_data = response.json()
|
||||
if not result_data.get("success", False):
|
||||
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
||||
|
||||
|
||||
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
||||
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
||||
return results[0] if len(results) == 1 else results
|
||||
|
||||
@@ -47,6 +47,7 @@ from urllib.parse import (
|
||||
urljoin, urlparse, urlunparse,
|
||||
parse_qsl, urlencode, quote, unquote
|
||||
)
|
||||
import inspect
|
||||
|
||||
|
||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||
@@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
|
||||
available_gb = get_true_available_memory_gb()
|
||||
used_percent = get_true_memory_usage_percent()
|
||||
|
||||
return used_percent, available_gb, total_gb
|
||||
return used_percent, available_gb, total_gb
|
||||
|
||||
|
||||
# Hook utilities for Docker API
|
||||
def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
|
||||
"""
|
||||
Convert hook function objects to string representations for Docker API.
|
||||
|
||||
This utility simplifies the process of using hooks with the Docker API by converting
|
||||
Python function objects into the string format required by the API.
|
||||
|
||||
Args:
|
||||
hooks: Dictionary mapping hook point names to Python function objects.
|
||||
Functions should be async and follow hook signature requirements.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping hook point names to string representations of the functions.
|
||||
|
||||
Example:
|
||||
>>> async def my_hook(page, context, **kwargs):
|
||||
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
... return page
|
||||
>>>
|
||||
>>> hooks_dict = {"on_page_context_created": my_hook}
|
||||
>>> api_hooks = hooks_to_string(hooks_dict)
|
||||
>>> # api_hooks is now ready to use with Docker API
|
||||
|
||||
Raises:
|
||||
ValueError: If a hook is not callable or source cannot be extracted
|
||||
"""
|
||||
result = {}
|
||||
|
||||
for hook_name, hook_func in hooks.items():
|
||||
if not callable(hook_func):
|
||||
raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")
|
||||
|
||||
try:
|
||||
# Get the source code of the function
|
||||
source = inspect.getsource(hook_func)
|
||||
# Remove any leading indentation to get clean source
|
||||
source = textwrap.dedent(source)
|
||||
result[hook_name] = source
|
||||
except (OSError, TypeError) as e:
|
||||
raise ValueError(
|
||||
f"Cannot extract source code for hook '{hook_name}'. "
|
||||
f"Make sure the function is defined in a file (not interactively). Error: {e}"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user